# Source code for scaffold_kit.utils.pattern_processor

"""Converts glob-like patterns to regular expressions.

This module provides classes for processing .gitignore-style glob patterns
and converting them into equivalent regular expressions. It uses a
handler-based, "strategy" pattern to process different types of characters
(e.g., wildcards, character classes, literals) and handles complex rules
like recursive wildcards and root-anchored patterns.

Demo:
    To run the module's demonstration code, use the following command:

    $ uv run python -m scaffold_kit.utils.pattern_processor
"""

from __future__ import annotations

import re

from abc import ABC, abstractmethod
from typing import Tuple


class CharacterHandler(ABC):
    """Abstract base class for character handlers.

    Character handlers define the logic for converting a specific type of
    pattern character into its regex equivalent.
    """

    @abstractmethod
    def can_handle(self, char: str) -> bool:
        """Checks if this handler can process the given character.

        Args:
            char: The single character to check.

        Returns:
            True if the handler can process the character, False otherwise.
        """

    @abstractmethod
    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Handles the character at the given position.

        Args:
            text: The full text being processed.
            position: Current position in the text.

        Returns:
            A tuple containing:
                - The replacement string for the character(s).
                - The new position in the text after processing.
        """
class WildcardHandler(CharacterHandler):
    """Handles '*' wildcard characters.

    Converts a single '*' glob character into its regex equivalent.
    """

    def can_handle(self, char: str) -> bool:
        """Checks if the character is a '*'.

        Args:
            char: The single character to check.

        Returns:
            True if the character is a wildcard, False otherwise.
        """
        return char == "*"

    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Converts '*' to '[^/]*'.

        The replacement excludes '/', so a single '*' never matches across
        a path separator.

        Args:
            text: The full text being processed.
            position: Current position in the text.

        Returns:
            A tuple of the replacement regex and the new position.
        """
        return "[^/]*", position + 1
class SingleCharHandler(CharacterHandler):
    """Handles '?' single character wildcards.

    Converts a single '?' glob character into its regex equivalent.
    """

    def can_handle(self, char: str) -> bool:
        """Checks if the character is a '?'.

        Args:
            char: The single character to check.

        Returns:
            True if the character is a single-char wildcard, False otherwise.
        """
        return char == "?"

    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Converts '?' to '[^/]'.

        The replacement matches exactly one character other than '/'.

        Args:
            text: The full text being processed.
            position: Current position in the text.

        Returns:
            A tuple of the replacement regex and the new position.
        """
        return "[^/]", position + 1
class CharacterClassHandler(CharacterHandler):
    """Handles '[...]' character classes.

    Captures the entire character class including its content and closing
    bracket, translating glob negation ('[!...]') into regex negation
    ('[^...]').
    """

    def can_handle(self, char: str) -> bool:
        """Checks if the character is a '['.

        Args:
            char: The single character to check.

        Returns:
            True if the character opens a character class, False otherwise.
        """
        return char == "["

    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Extracts the entire character class from the text.

        Args:
            text: The full text being processed.
            position: Current position in the text (the opening '[').

        Returns:
            A tuple containing:
                - The regex string for the character class, or an escaped
                  '[' when no closing bracket exists.
                - The new position in the text after processing.
        """
        length = len(text)
        cursor = position + 1  # Skip opening '['.

        # Both '!' and '^' negate a glob class; only '^' does so in a regex,
        # so the marker is dropped here and re-emitted as '^' below.
        negated = cursor < length and text[cursor] in ("!", "^")
        if negated:
            cursor += 1
        content_start = cursor

        # A ']' directly after the opening (or negation) is a literal member
        # of the class, not its terminator.
        if cursor < length and text[cursor] == "]":
            cursor += 1

        closing = text.find("]", cursor)
        if closing == -1:
            # No closing bracket found, treat the '[' as a literal.
            return re.escape("["), position + 1

        opener = "[^" if negated else "["
        return f"{opener}{text[content_start:closing]}]", closing + 1
class LiteralCharHandler(CharacterHandler):
    """Handles literal characters (default handler).

    Converts a literal character to a regex-escaped string.
    """

    def can_handle(self, char: str) -> bool:
        """Checks if this is the fallback handler.

        This is the fallback handler, so it can handle any character.

        Args:
            char: The single character to check.

        Returns:
            True.
        """
        return True

    def handle(self, text: str, position: int) -> Tuple[str, int]:
        """Escapes a single literal character for regex.

        Args:
            text: The full text being processed.
            position: Current position in the text.

        Returns:
            A tuple of the escaped character and the new position.
        """
        return re.escape(text[position]), position + 1
class GlobProcessor:
    """Processes glob patterns using the strategy pattern.

    This class iterates through a glob string, applying the appropriate
    CharacterHandler to each character to build a regex string part.
    """

    def __init__(self) -> None:
        """Initializes the GlobProcessor with a list of handlers.

        Note that the order of the handlers is crucial. More specific
        handlers (e.g., wildcards, character classes) must come before the
        generic fallback handler (LiteralCharHandler).
        """
        # Order matters! More specific handlers should come first.
        self.handlers = [
            WildcardHandler(),
            SingleCharHandler(),
            CharacterClassHandler(),
            LiteralCharHandler(),  # Fallback handler - must be last.
        ]

    def convert_glob_part(self, part: str) -> str:
        """Converts a single glob part to regex using character handlers.

        Args:
            part: A single string part of a glob pattern
                (e.g., 'path', '*', '**').

        Returns:
            The regex equivalent of the glob part.
        """
        # The recursive wildcard is recognised only as a whole part.
        if part == "**":
            return ".*"

        pieces = []
        position = 0
        while position < len(part):
            # The first handler that accepts the character decides both the
            # replacement text and how far the cursor advances.
            handler = self._find_handler(part[position])
            replacement, position = handler.handle(part, position)
            pieces.append(replacement)
        return "".join(pieces)

    def _find_handler(self, char: str) -> CharacterHandler:
        """Finds the appropriate handler for the given character.

        Args:
            char: The single character to find a handler for.

        Returns:
            The first matching `CharacterHandler` instance.

        Raises:
            RuntimeError: If no handler is found for the given character.
                This should not happen if LiteralCharHandler is present.
        """
        for handler in self.handlers:
            if handler.can_handle(char):
                return handler
        # This should never happen since LiteralCharHandler handles everything.
        raise RuntimeError(f"No handler found for character: {char}")
class PatternProcessor:
    """Main class for converting glob patterns to regex.

    This class orchestrates the entire conversion process, handling
    normalization, splitting, and joining of the regex parts.
    """

    # Regex that GlobProcessor.convert_glob_part emits for a '**' part.
    _RECURSIVE_PART = ".*"

    def __init__(self) -> None:
        """Initializes the processor with a GlobProcessor instance."""
        self.glob_processor = GlobProcessor()

    def pattern_to_regex(self, pattern: str) -> str:
        """Converts a .gitignore-style glob pattern to a regex.

        Args:
            pattern: The glob pattern string to convert.

        Returns:
            The complete, anchored regular expression string.
        """
        # 1. Normalize the pattern.
        normalized_pattern = self._normalize_pattern(pattern)

        # 2. Split into parts and convert each part.
        regex_parts = [
            self.glob_processor.convert_glob_part(part)
            for part in normalized_pattern.split("/")
        ]

        # 3. Join the parts with appropriate separators and add anchors.
        return f"^{self._join_regex_parts(regex_parts)}$"

    def _normalize_pattern(self, pattern: str) -> str:
        """Applies initial pattern transformations.

        Args:
            pattern: The glob pattern string.

        Returns:
            The normalized pattern string.
        """
        # If a pattern has no slashes, it is treated as if it were
        # preceded by '**/'.
        if "/" not in pattern:
            pattern = f"**/{pattern}"

        # If a pattern starts with a slash, it is anchored to the project
        # root, so the leading slash is dropped.
        if pattern.startswith("/"):
            pattern = pattern[1:]

        return pattern

    def _join_regex_parts(self, regex_parts: list[str]) -> str:
        """Joins regex parts with appropriate separators.

        A recursive-wildcard part ('.*') that is followed by further parts
        is emitted as '(?:.*/)?', i.e. zero or more complete directory
        levels, so the surrounding parts always match whole path
        components. A trailing recursive wildcard stays '.*' and keeps the
        '/' that precedes it.

        Args:
            regex_parts: A list of regex strings to join.

        Returns:
            The joined regex string ('' for an empty list).
        """
        last_index = len(regex_parts) - 1
        pieces = []
        for index, part in enumerate(regex_parts):
            is_last = index == last_index
            if part == self._RECURSIVE_PART:
                # The optional group carries its own trailing '/', so no
                # separate separator is added after it.
                pieces.append(part if is_last else "(?:.*/)?")
            else:
                pieces.append(part if is_last else f"{part}/")
        return "".join(pieces)
def _run_demo() -> None:
    """Prints the regex generated for a set of representative patterns."""
    processor = PatternProcessor()

    # (pattern, description) pairs. The description only documents what
    # each case exercises; it is not part of the printed output.
    demo_cases = [
        ("*.py", "Simple wildcard"),
        ("test?.txt", "Single char wildcard"),
        ("**/*.log", "Recursive wildcard"),
        ("build/", "Directory pattern"),
        ("/root-only", "Root-anchored pattern"),
        ("test[0-9].txt", "Character class"),
        ("file[!a-c].dat", "Negated character class"),
        ("path/*/sub", "Wildcard in middle"),
        ("no-special-chars", "Literal pattern"),
        ("a*b?c[def]g", "Multiple wildcards"),
        ("**", "Just recursive wildcard"),
        ("[abc]", "Just character class"),
        ("[]broken", "Malformed character class"),
    ]

    # Convert everything before printing so a failing pattern is reported
    # before any partial output appears.
    conversions = [
        (pattern, processor.pattern_to_regex(pattern))
        for pattern, _description in demo_cases
    ]

    print("\n=== pattern_to_regex ===\n")
    for pattern, regex in conversions:
        print(f"{pattern}: {regex}")


if __name__ == "__main__":
    _run_demo()