# Source code for scaffold_kit.utils.pattern_processor

"""Converts glob-like patterns to regular expressions.

This module provides classes for processing .gitignore-style glob patterns
and converting them into equivalent regular expressions. It uses a
handler-based, "strategy" pattern to process different types of characters
(e.g., wildcards, character classes, literals) and handles complex rules
like recursive wildcards and root-anchored patterns.

Demo:
    To run the module's demonstration code, use the following command:

    $ uv run python -m scaffold_kit.utils.pattern_processor
"""

from __future__ import annotations

import re

from abc import ABC, abstractmethod
from typing import Tuple



# [docs]
class CharacterHandler(ABC):
    """Abstract base class for character handlers.

    A character handler converts one kind of glob pattern character (or a
    run of characters introduced by it) into its regex equivalent.
    Concrete subclasses must implement both `can_handle` and `handle`.
    """

    @abstractmethod
    def can_handle(self, char: str) -> bool:
        """Checks if this handler can process the given character.

        Args:
            char: The single character to check.

        Returns:
            True if the handler can process the character, False otherwise.
        """

    @abstractmethod
    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Handles the character at the given position.

        Args:
            text: The full text being processed.
            position: Current position in the text.

        Returns:
            A tuple containing:
                - The replacement string for the character(s).
                - The new position in the text after processing.
        """





# [docs]
class WildcardHandler(CharacterHandler):
    """Handles '*' wildcard characters.

    A single '*' is translated to ``[^/]*``: any run of characters
    (possibly empty) that does not contain a path separator.
    """

    def can_handle(self, char: str) -> bool:
        """Checks if the character is a '*'.

        Args:
            char: The single character to check.

        Returns:
            True if the character is a wildcard, False otherwise.
        """
        return char == "*"

    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Converts '*' to '[^/]*'.

        Args:
            text: The full text being processed (not inspected here).
            position: Current position in the text.

        Returns:
            A tuple of the replacement regex and the position just past
            the consumed '*'.
        """
        return "[^/]*", position + 1





# [docs]
class SingleCharHandler(CharacterHandler):
    """Handles '?' single character wildcards.

    A '?' is translated to ``[^/]``: exactly one character that is not a
    path separator.
    """

    def can_handle(self, char: str) -> bool:
        """Checks if the character is a '?'.

        Args:
            char: The single character to check.

        Returns:
            True if the character is a single-char wildcard, False otherwise.
        """
        return char == "?"

    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Converts '?' to '[^/]'.

        Args:
            text: The full text being processed (not inspected here).
            position: Current position in the text.

        Returns:
            A tuple of the replacement regex and the position just past
            the consumed '?'.
        """
        return "[^/]", position + 1





# [docs]
class CharacterClassHandler(CharacterHandler):
    """Handles '[...]' character classes.

    Captures the entire character class including its content and closing
    bracket. A leading '!' or '^' is treated as negation and is always
    emitted as the regex negation marker '^'. The remaining members are
    copied into the regex unchanged.
    """

    def can_handle(self, char: str) -> bool:
        """Checks if the character is a '['.

        Args:
            char: The single character to check.

        Returns:
            True if the character opens a character class, False otherwise.
        """
        return char == "["

    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Extracts the entire character class from the text.

        Args:
            text: The full text being processed.
            position: Position of the opening '[' in the text.

        Returns:
            A tuple containing:
                - The regex string for the character class, or an escaped
                  literal '[' when no closing ']' exists.
                - The new position in the text after processing.
        """
        length = len(text)
        cursor = position + 1  # Skip opening '['.

        # Glob negation may be spelled '!' or '^'; regex only knows '^'.
        negated = cursor < length and text[cursor] in ("!", "^")
        if negated:
            cursor += 1
        members_start = cursor

        # A ']' directly after '[' (or after the negation marker) is a
        # literal member of the class, not its terminator.
        if cursor < length and text[cursor] == "]":
            cursor += 1

        closing = text.find("]", cursor)
        if closing == -1:
            # No closing bracket found, treat the '[' as a literal.
            return re.escape("["), position + 1

        opener = "[^" if negated else "["
        return f"{opener}{text[members_start:closing]}]", closing + 1





# [docs]
class LiteralCharHandler(CharacterHandler):
    """Handles literal characters (default handler).

    Converts a literal character to a regex-escaped string. Because it
    accepts every character, it only makes sense as the last handler
    consulted.
    """

    def can_handle(self, char: str) -> bool:
        """Accepts any character.

        This is the fallback handler, so it can handle any character.

        Args:
            char: The single character to check (ignored).

        Returns:
            True.
        """
        return True

    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Escapes a single literal character for regex.

        Args:
            text: The full text being processed.
            position: Current position in the text.

        Returns:
            A tuple of the escaped character and the new position.
        """
        return re.escape(text[position]), position + 1





# [docs]
class GlobProcessor:
    """Processes glob patterns using the strategy pattern.

    This class iterates through a glob string, applying the appropriate
    CharacterHandler to each character to build a regex string part.
    """

    def __init__(self):
        """Initializes the GlobProcessor with a list of handlers.

        Note that the order of the handlers is crucial. More specific handlers
        (e.g., wildcards, character classes) must come before the generic
        fallback handler (LiteralCharHandler).
        """
        # Order matters! More specific handlers should come first.
        self.handlers = [
            WildcardHandler(),
            SingleCharHandler(),
            CharacterClassHandler(),
            LiteralCharHandler(),  # Fallback handler - must be last.
        ]


[docs]
    def convert_glob_part(self, part: str) -> str:
        """Converts a single glob part to regex using character handlers.

        Args:
            part: A single string part of a glob pattern
                (e.g., 'path', '*', '**').

        Returns:
            The regex equivalent of the glob part.
        """
        # 1. Handle recursive wildcard special case.
        if part == "**":
            return ".*"

        result = ""
        position = 0

        while position < len(part):
            char = part[position]

            # 2. Find the first handler that can process this character.
            handler = self._find_handler(char)
            replacement, new_position = handler.handle(part, position)

            result += replacement
            position = new_position

        return result


    def _find_handler(self, char: str) -> CharacterHandler:
        """Finds the appropriate handler for the given character.

        Args:
            char: The single character to find a handler for.

        Returns:
            The first matching `CharacterHandler` instance.

        Raises:
            RuntimeError: If no handler is found for the given character.
                This should not happen if LiteralCharHandler is present.
        """
        for handler in self.handlers:
            if handler.can_handle(char):
                return handler

        # This should never happen since LiteralCharHandler handles everything.
        raise RuntimeError(f"No handler found for character: {char}")




# [docs]
class PatternProcessor:
    """Main class for converting glob patterns to regex.

    This class orchestrates the entire conversion process, handling
    normalization, splitting, and joining of the regex parts.
    """

    def __init__(self) -> None:
        """Initializes the processor with a GlobProcessor instance."""
        self.glob_processor = GlobProcessor()

    def pattern_to_regex(self, pattern: str) -> str:
        """Converts a .gitignore-style glob pattern to a regex.

        Args:
            pattern: The glob pattern string to convert.

        Returns:
            The complete regular expression string, anchored with '^'
            and '$'.
        """
        # 1. Normalize, 2. convert each '/'-separated part, 3. join.
        normalized_pattern = self._normalize_pattern(pattern)
        regex_parts = [
            self.glob_processor.convert_glob_part(part)
            for part in normalized_pattern.split("/")
        ]
        joined_regex = self._join_regex_parts(regex_parts)

        # 4. Add anchors.
        return f"^{joined_regex}$"

    def _normalize_pattern(self, pattern: str) -> str:
        """Applies initial pattern transformations.

        Args:
            pattern: The glob pattern string.

        Returns:
            The normalized pattern string: a leading '/' is removed, and a
            pattern with no slash other than trailing ones is prefixed
            with '**/'.
        """
        # If a pattern starts with a slash, it is anchored to the project
        # root; the slash itself is not part of the matched path.
        if pattern.startswith("/"):
            return pattern[1:]

        # A trailing slash only marks "directory"; it does not anchor the
        # pattern. Without any other slash the pattern may match at any
        # depth, which is expressed by prefixing '**/'.
        if "/" not in pattern.rstrip("/"):
            return f"**/{pattern}"

        return pattern

    def _join_regex_parts(self, regex_parts: list[str]) -> str:
        """Joins regex parts with appropriate separators.

        A part equal to '.*' is taken to be a recursive wildcard. When it
        is not the last part it becomes '(?:.*/)?' (zero or more whole
        directories); as the last part it stays '.*' and keeps the
        preceding '/'. All other neighbouring parts are joined with '/'.

        Args:
            regex_parts: A list of regex strings to join.

        Returns:
            The joined regex string.
        """
        if not regex_parts:
            return ""

        last_index = len(regex_parts) - 1
        pieces = []
        for index, part in enumerate(regex_parts):
            if part == ".*" and index != last_index:
                # Carries its own trailing separator, so that 'x/**/y'
                # matches 'x/y' and 'x/a/y' but never 'xy'.
                pieces.append("(?:.*/)?")
                continue

            pieces.append(part)
            if index != last_index:
                pieces.append("/")

        return "".join(pieces)



def main() -> None:
    """Prints the regex produced for a set of sample glob patterns."""
    # (pattern, description) pairs; the description documents what each
    # sample exercises and is not printed.
    samples = [
        ("*.py", "Simple wildcard"),
        ("test?.txt", "Single char wildcard"),
        ("**/*.log", "Recursive wildcard"),
        ("build/", "Directory pattern"),
        ("/root-only", "Root-anchored pattern"),
        ("test[0-9].txt", "Character class"),
        ("file[!a-c].dat", "Negated character class"),
        ("path/*/sub", "Wildcard in middle"),
        ("no-special-chars", "Literal pattern"),
        ("a*b?c[def]g", "Multiple wildcards"),
        ("**", "Just recursive wildcard"),
        ("[abc]", "Just character class"),
        ("[]broken", "Malformed character class"),
    ]

    processor = PatternProcessor()
    # Convert everything first so a failing pattern aborts before any
    # output is written.
    conversions = [
        (pattern, processor.pattern_to_regex(pattern))
        for pattern, _description in samples
    ]

    print("\n=== pattern_to_regex ===\n")
    for pattern, regex in conversions:
        print(f"{pattern}: {regex}")


if __name__ == "__main__":
    main()
# Source code for scaffold_kit.utils.pattern_processor
#
# Scaffold Kit
#
# Navigation
#
# Related Topics