"""A set of utilities for string manipulation.
This module provides functions for transliterating unicode characters and
creating URL-friendly "slugs" from text.
Demo:
To run the module's demonstration code, use the following command:
$ uv run python -m scaffold_kit.utils.string_utils
"""
from __future__ import annotations
import re
import unicodedata
# (replacement, characters) groups, listed in the exact order in which the
# characters are inserted into DIACRITICS_MAP. The German umlauts sit in
# groups of their own because they expand to two letters ("Ä" -> "Ae")
# instead of simply losing the diacritic.
_DIACRITIC_GROUPS: tuple[tuple[str, str], ...] = (
    ("A", "ÀÁÃ"),
    ("Ae", "Ä"),
    ("A", "ÅĀĂĄ"),
    ("a", "àáã"),
    ("ae", "ä"),
    ("a", "åāăą"),
    ("C", "ÇĆĈČ"),
    ("c", "çćĉč"),
    ("D", "ĎĐ"),
    ("d", "ďđ"),
    ("E", "ÈÉẼËĔĒĚĘ"),
    ("e", "èéẽëĕėěę"),
    ("G", "ĠĢĜĞ"),
    ("g", "ġģĝğ"),
    ("H", "ĤĦ"),
    ("h", "ĥħ"),
    ("I", "ÌÍÎÏĮĪİ"),
    ("i", "ìíîïīĩ"),
    ("J", "Ĵ"),
    ("j", "ĵ"),
    ("K", "Ķ"),
    ("k", "ķ"),
    ("L", "ĹĻĽĿ"),
    ("l", "ĺļľ"),
    ("N", "ÑŅŇ"),
    ("n", "ņň"),
    ("O", "ÒÓÔÕ"),
    ("Oe", "Ö"),
    ("O", "ŌŎŐ"),
    ("o", "òóôõ"),
    ("oe", "ö"),
    ("o", "ōŏő"),
    ("U", "ÙÚÛ"),
    ("Ue", "Ü"),
    ("U", "ŪŮŰŲ"),
    ("u", "ùúû"),
    ("ue", "ü"),
    ("u", "ūůű"),
    ("W", "Ŵ"),
    ("w", "ŵ"),
    ("Y", "ÝŸ"),
    ("y", "ýÿ"),
    ("Y", "Ŷ"),
    ("y", "ŷ"),
    ("Z", "ŽŻ"),
    ("z", "źżž"),
)
DIACRITICS_MAP: dict[str, str] = {
    char: replacement
    for replacement, chars in _DIACRITIC_GROUPS
    for char in chars
}
"""Maps single accented Latin characters to their ASCII replacement string."""  # pylint: disable=W0105
# Every key MUST be exactly one code point: the merged map is handed to
# ``str.maketrans``, which raises ``ValueError`` for any string key whose
# length is not 1. The typographic ligatures are therefore spelled as
# ``\uXXXX`` escapes, so they cannot be silently expanded into their ASCII
# look-alikes (e.g. "ff") by editors or text-normalisation tools.
LIGATURES_MAP: dict[str, str] = {
    "æ": "ae",
    "Æ": "Ae",
    "œ": "oe",
    "Œ": "Oe",
    "ß": "ss",
    "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
    "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
    "\ufb02": "fl",  # LATIN SMALL LIGATURE FL
    "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
    # Long s is a form of "s", not "f": Unicode decomposes U+FB05 to "st".
    "\ufb05": "st",  # LATIN SMALL LIGATURE LONG S T
    "\ufb06": "st",  # LATIN SMALL LIGATURE ST
    "\u0133": "ij",  # LATIN SMALL LIGATURE IJ
    "\u0132": "Ij",  # LATIN CAPITAL LIGATURE IJ
    "ʒ": "ezh",
    "Ʒ": "Ezh",  # capitalised form of the "ezh" used for the lowercase letter
}
"""Maps single ligature (and ezh) characters to their ASCII letter sequence."""  # pylint: disable=W0105
# Start from a copy of the diacritics table, then layer the ligatures on top;
# should a key ever appear in both tables, the ligature entry wins.
TRANSLITERATE_MAP: dict[str, str] = dict(DIACRITICS_MAP)
TRANSLITERATE_MAP.update(LIGATURES_MAP)
"""Combined character-to-ASCII lookup: DIACRITICS_MAP overlaid with LIGATURES_MAP."""  # pylint: disable=W0105
def transliterate(text: str) -> str:
    """Transliterate unicode characters to their closest ASCII replacements.

    The conversion runs in three passes:

    1. ``text`` is normalised to NFC, so a letter written as a base character
       followed by a combining mark (e.g. ``"a\\u0308"``) is composed into the
       single character (``"ä"``) that ``TRANSLITERATE_MAP`` is keyed on.
       Without this step the same visible text would transliterate
       differently depending on how it happened to be encoded.
    2. Every character found in ``TRANSLITERATE_MAP`` is replaced by its
       mapped ASCII string, e.g. 'ç' → 'c', 'æ' → 'ae', 'ß' → 'ss'.
    3. The result is decomposed with NFKD and encoded to ASCII with
       ``errors="ignore"``: accented letters that are not in the map lose
       their marks, and anything that still is not ASCII is dropped.

    Args:
        text: Any string, possibly containing non-ASCII characters.

    Returns:
        A plain ASCII string. The conversion is lossy: characters without an
        ASCII decomposition disappear, while surrounding whitespace is kept
        as-is (so runs of spaces may remain where characters were dropped).

    Raises:
        ValueError: If ``TRANSLITERATE_MAP`` contains a key that is not
            exactly one character long (raised by ``str.maketrans``).
            No exception is caught inside this function.

    Examples:
        Handling diacritics:

        >>> transliterate("François Café")
        'Francois Cafe'

        Dropped characters leave their neighbouring spaces behind:

        >>> transliterate("Straße – café naïf")
        'Strasse  cafe naif'

        Ligatures:

        >>> transliterate("Encyclopædia & fluffy œuf")
        'Encyclopaedia & fluffy oeuf'

        Greek, math symbols and emojis are stripped; '½' decomposes to
        '1', FRACTION SLASH, '2', of which only the digits are ASCII:

        >>> transliterate("α ≤ ½ 😊")
        '  12 '
    """
    # 1. Compose "base letter + combining mark" sequences so that the
    #    single-character keys of TRANSLITERATE_MAP can match them.
    text = unicodedata.normalize("NFC", text)
    # 2. Map predefined characters to their ASCII replacements.
    text = text.translate(str.maketrans(TRANSLITERATE_MAP))
    # 3. Decompose what is left and drop every remaining non-ASCII character.
    #    The bytes are pure ASCII at this point, so decode them as such.
    return (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
def slugify(text: str) -> str:
    """Convert a string into a URL-safe, ASCII-only slug.

    The text is transliterated to ASCII with ``transliterate`` and
    lower-cased. Punctuation and other symbols are then *removed* (they do
    not become separators), and every run of whitespace, underscores and
    hyphens is collapsed into a single hyphen. The result contains only
    lowercase letters ([a-z]), digits ([0-9]) and hyphens, and never starts
    or ends with a hyphen.

    Args:
        text: The original, possibly unicode string that needs to be
            slugified.

    Returns:
        A hyphen-separated ASCII slug derived from ``text``. If ``text`` is
        empty, or nothing survives the transformation, the slug is the empty
        string ("").

    Raises:
        ValueError: Propagated unchanged from ``transliterate``; no exception
            is caught inside this function.

    Examples:
        Basic usage:

        >>> slugify("Café crème à la française")
        'cafe-creme-a-la-francaise'

        Punctuation is dropped, not turned into a hyphen:

        >>> slugify("J'aime les fromages français!")
        'jaime-les-fromages-francais'

        Mixed spaces, underscores and hyphens collapse to one hyphen:

        >>> slugify("  ¡Hola!   ¿Qué_tal?  ")
        'hola-que-tal'

        Already clean ASCII strings stay the same, except for case:

        >>> slugify("Valid-slug-already-given")
        'valid-slug-already-given'

        Symbol-only input results in an empty string:

        >>> slugify("!!!!! ???")
        ''
    """
    # 1. Replace diacritics and ligatures, then lowercase and trim the ends.
    ascii_text = transliterate(text).lower().strip()
    # 2. Drop everything that is not a word character, whitespace or hyphen.
    cleaned = re.sub(r"[^\w\s-]", "", ascii_text)
    # 3. Collapse each run of whitespace/underscores/hyphens into one hyphen.
    hyphenated = re.sub(r"[\s_-]+", "-", cleaned)
    # 4. Remove the hyphen a leading or trailing separator run may have left.
    return hyphenated.strip("-")
if __name__ == "__main__":
    # Demonstration: push a few multilingual sample sentences through both
    # helpers and print each input next to its converted form.
    samples = (
        "Hëllô, wörld!",
        "ça va être une journée spéciale.",
        "Ich liebe Deutsche Küche!",
        "¿Cómo estás? ¡Hasta mañana!",
        "Život je lijep, ali često i kratak.",
        "Não sei o que fazer agora...",
        "J'aime les fromages français!",
        "Küß mich, meine Schöne!",
        "Škoda, že jsem to nevím.",
        "¿Dónde está el baño, por favor?",
        "Encyclopædia & fluffy œuf",
        "Straße – café naïf",
    )
    # Convert every sample before printing anything, so nothing is written
    # to stdout if either helper raises.
    transliterated = [(sample, transliterate(sample)) for sample in samples]
    slugified = [(sample, slugify(sample)) for sample in samples]
    for heading, pairs in (
        ("\n=== transliterate ===\n", transliterated),
        ("\n=== slugify ===\n", slugified),
    ):
        print(heading)
        for original, converted in pairs:
            print(f"{original}: {converted}")