Skip to content

script_detection

Script detection utilities for text analysis.

ScriptType

Bases: StrEnum

Script type enumeration for text analysis.

has_arabic

has_arabic(text)

Check if the text contains Arabic characters.

Covers:

- Arabic (U+0600-U+06FF) - Arabic, Persian, Urdu
- Arabic Supplement (U+0750-U+077F) - Additional Arabic letters

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Arabic characters are found, False otherwise

Examples:

>>> has_arabic("محمد عبده")
True
>>> has_arabic("Pink Floyd")
False
Source code in src/core/models/script_detection.py
def has_arabic(text: str) -> bool:
    """Report whether ``text`` holds at least one Arabic-block character.

    Blocks checked:
    - Arabic (U+0600-U+06FF) - Arabic, Persian, Urdu
    - Arabic Supplement (U+0750-U+077F) - Additional Arabic letters

    Args:
        text: Text to analyze

    Returns:
        True as soon as one character falls in a checked block; False for
        empty input or when no character matches

    Examples:
        >>> has_arabic("محمد عبده")
        True
        >>> has_arabic("Pink Floyd")
        False

    """
    if not text:
        return False
    for ch in text:
        if "\u0600" <= ch <= "\u06ff":  # Arabic
            return True
        if "\u0750" <= ch <= "\u077f":  # Arabic Supplement
            return True
    return False

has_chinese

has_chinese(text)

Check if the text contains Chinese characters.

Covers:

- CJK Unified Ideographs (U+4E00-U+9FFF) - Han characters
- CJK Extension A (U+3400-U+4DBF) - Additional Han characters

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Chinese characters are found, False otherwise

Examples:

>>> has_chinese("周杰伦")
True
>>> has_chinese("Pink Floyd")
False
Source code in src/core/models/script_detection.py
def has_chinese(text: str) -> bool:
    """Report whether ``text`` holds at least one Han (Chinese) character.

    Blocks checked:
    - CJK Unified Ideographs (U+4E00-U+9FFF) - Han characters
    - CJK Extension A (U+3400-U+4DBF) - Additional Han characters

    Args:
        text: Text to analyze

    Returns:
        True if any character falls in a checked block, False otherwise
        (including for empty input)

    Examples:
        >>> has_chinese("周杰伦")
        True
        >>> has_chinese("Pink Floyd")
        False

    """
    if not text:
        return False
    han_blocks = (
        ("\u4e00", "\u9fff"),  # CJK Unified Ideographs
        ("\u3400", "\u4dbf"),  # CJK Extension A
    )
    return any(low <= ch <= high for ch in text for low, high in han_blocks)

has_cyrillic

has_cyrillic(text)

Check if the text contains Cyrillic characters.

Covers:

- Basic Cyrillic (U+0400-U+04FF) - Russian, Ukrainian, Serbian, Bulgarian, etc.
- Cyrillic Supplement (U+0500-U+052F) - Additional characters
- Cyrillic Extended-A (U+2DE0-U+2DFF) - Historic letters
- Cyrillic Extended-B (U+A640-U+A69F) - Additional historic letters

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Cyrillic characters are found, False otherwise

Examples:

>>> has_cyrillic("МУР")
True
>>> has_cyrillic("Pink Floyd")
False
>>> has_cyrillic("діти інженерів")
True
Source code in src/core/models/script_detection.py
def has_cyrillic(text: str) -> bool:
    """Report whether ``text`` holds at least one Cyrillic character.

    Blocks checked:
    - Basic Cyrillic (U+0400-U+04FF) - Russian, Ukrainian, Serbian, Bulgarian, etc.
    - Cyrillic Supplement (U+0500-U+052F) - Additional characters
    - Cyrillic Extended-A (U+2DE0-U+2DFF) - Historic letters
    - Cyrillic Extended-B (U+A640-U+A69F) - Additional historic

    Args:
        text: Text to analyze

    Returns:
        True if any character falls in a checked block, False otherwise
        (including for empty input)

    Examples:
        >>> has_cyrillic("МУР")
        True
        >>> has_cyrillic("Pink Floyd")
        False
        >>> has_cyrillic("діти інженерів")
        True

    """
    if not text:
        return False
    cyrillic_blocks = (
        ("\u0400", "\u04ff"),  # Basic Cyrillic
        ("\u0500", "\u052f"),  # Cyrillic Supplement
        ("\u2de0", "\u2dff"),  # Cyrillic Extended-A
        ("\ua640", "\ua69f"),  # Cyrillic Extended-B
    )
    for ch in text:
        for first, last in cyrillic_blocks:
            if first <= ch <= last:
                return True
    return False

has_devanagari

has_devanagari(text)

Check if the text contains Devanagari characters.

Covers:

- Devanagari (U+0900-U+097F) - Hindi, Marathi, Sanskrit, Nepali

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Devanagari characters are found, False otherwise

Examples:

>>> has_devanagari("हिन्दी संगीत")
True
>>> has_devanagari("Pink Floyd")
False
Source code in src/core/models/script_detection.py
def has_devanagari(text: str) -> bool:
    """Report whether ``text`` holds at least one Devanagari character.

    Block checked:
    - Devanagari (U+0900-U+097F) - Hindi, Marathi, Sanskrit, Nepali

    Args:
        text: Text to analyze

    Returns:
        True if any character falls in the block, False otherwise
        (including for empty input)

    Examples:
        >>> has_devanagari("हिन्दी संगीत")
        True
        >>> has_devanagari("Pink Floyd")
        False

    """
    if not text:
        return False
    block_start, block_end = "\u0900", "\u097f"
    return any(block_start <= ch <= block_end for ch in text)

has_greek

has_greek(text)

Check if the text contains Greek characters.

Covers:

- Greek and Coptic (U+0370-U+03FF) - Modern and ancient Greek

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Greek characters are found, False otherwise

Examples:

>>> has_greek("Μουσική")
True
>>> has_greek("Pink Floyd")
False
Source code in src/core/models/script_detection.py
def has_greek(text: str) -> bool:
    """Report whether ``text`` holds at least one Greek character.

    Block checked:
    - Greek and Coptic (U+0370-U+03FF) - Modern and ancient Greek

    Args:
        text: Text to analyze

    Returns:
        True if any character falls in the block, False otherwise
        (including for empty input)

    Examples:
        >>> has_greek("Μουσική")
        True
        >>> has_greek("Pink Floyd")
        False

    """
    if not text:
        return False
    for ch in text:
        if "\u0370" <= ch <= "\u03ff":
            return True
    return False

has_hebrew

has_hebrew(text)

Check if the text contains Hebrew characters.

Covers:

- Hebrew (U+0590-U+05FF) - Hebrew alphabet

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Hebrew characters are found, False otherwise

Examples:

>>> has_hebrew("מוזיקה עברית")
True
>>> has_hebrew("Pink Floyd")
False
Source code in src/core/models/script_detection.py
def has_hebrew(text: str) -> bool:
    """Report whether ``text`` holds at least one Hebrew character.

    Block checked:
    - Hebrew (U+0590-U+05FF) - Hebrew alphabet

    Args:
        text: Text to analyze

    Returns:
        True if any character falls in the block, False otherwise
        (including for empty input)

    Examples:
        >>> has_hebrew("מוזיקה עברית")
        True
        >>> has_hebrew("Pink Floyd")
        False

    """
    if not text:
        return False
    for ch in text:
        if "\u0590" <= ch <= "\u05ff":
            return True
    return False

has_japanese

has_japanese(text)

Check if the text contains Japanese characters.

Covers:

- Hiragana (U+3040-U+309F) - Japanese syllabary
- Katakana (U+30A0-U+30FF) - Japanese syllabary
- CJK Unified Ideographs (U+4E00-U+9FFF) - Kanji (shared with Chinese)

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Japanese characters are found, False otherwise

Examples:

>>> has_japanese("音楽")
True
>>> has_japanese("ひらがな")
True
>>> has_japanese("カタカナ")
True
>>> has_japanese("Pink Floyd")
False
Source code in src/core/models/script_detection.py
def has_japanese(text: str) -> bool:
    """Check if the text contains Japanese characters.

    Covers:
    - Hiragana (U+3040-U+309F) - Japanese syllabary
    - Katakana (U+30A0-U+30FF) - Japanese syllabary
    - Katakana Phonetic Extensions (U+31F0-U+31FF) - Small kana (Ainu)
    - Halfwidth Katakana (U+FF66-U+FF9F) - Narrow kana forms
    - CJK Unified Ideographs (U+4E00-U+9FFF) - Kanji (shared with Chinese)

    Because the ideograph block is shared, text written purely in Han
    characters (including Chinese) is also reported as Japanese here.

    Args:
        text: Text to analyze

    Returns:
        True if Japanese characters are found, False otherwise
        (including for empty input)

    Examples:
        >>> has_japanese("音楽")
        True
        >>> has_japanese("ひらがな")
        True
        >>> has_japanese("カタカナ")
        True
        >>> has_japanese("ｶﾀｶﾅ")
        True
        >>> has_japanese("Pink Floyd")
        False

    """
    if not text:
        return False
    return any(
        "\u3040" <= c <= "\u309f"  # Hiragana
        or "\u30a0" <= c <= "\u30ff"  # Katakana
        or "\u31f0" <= c <= "\u31ff"  # Katakana Phonetic Extensions
        # Halfwidth kana only: the rest of U+FF00-U+FFEF holds fullwidth
        # Latin, halfwidth Hangul and punctuation, which are not Japanese.
        or "\uff66" <= c <= "\uff9f"
        or "\u4e00" <= c <= "\u9fff"  # CJK Unified Ideographs (Kanji)
        for c in text
    )

has_korean

has_korean(text)

Check if the text contains Korean characters.

Covers:

- Hangul Syllables (U+AC00-U+D7AF) - Korean alphabet
- Hangul Jamo (U+1100-U+11FF) - Korean alphabet components

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Korean characters are found, False otherwise

Examples:

>>> has_korean("한국 음악")
True
>>> has_korean("Pink Floyd")
False
Source code in src/core/models/script_detection.py
def has_korean(text: str) -> bool:
    """Check if the text contains Korean characters.

    Covers:
    - Hangul Syllables (U+AC00-U+D7AF) - Korean alphabet
    - Hangul Jamo (U+1100-U+11FF) - Korean alphabet components
    - Hangul Compatibility Jamo (U+3130-U+318F) - Standalone jamo as typed
      in everyday text (e.g. "ㅋㅋㅋ")
    - Hangul Jamo Extended-A (U+A960-U+A97F) - Archaic initial consonants
    - Hangul Jamo Extended-B (U+D7B0-U+D7FF) - Archaic vowels and finals

    Args:
        text: Text to analyze

    Returns:
        True if Korean characters are found, False otherwise
        (including for empty input)

    Examples:
        >>> has_korean("한국 음악")
        True
        >>> has_korean("ㅋㅋㅋ")
        True
        >>> has_korean("Pink Floyd")
        False

    """
    if not text:
        return False
    return any(
        "\uac00" <= c <= "\ud7af"  # Hangul Syllables
        or "\u1100" <= c <= "\u11ff"  # Hangul Jamo
        # Standalone jamo are encoded here, not in the combining Jamo block,
        # so jamo-only text would otherwise go undetected.
        or "\u3130" <= c <= "\u318f"  # Hangul Compatibility Jamo
        or "\ua960" <= c <= "\ua97f"  # Hangul Jamo Extended-A
        or "\ud7b0" <= c <= "\ud7ff"  # Hangul Jamo Extended-B
        for c in text
    )

has_latin

has_latin(text)

Check if the text contains Latin alphabetic characters.

Covers:

- Basic Latin (U+0041-U+005A, U+0061-U+007A) - A-Z, a-z
- Latin-1 Supplement (U+0080-U+00FF) - Accented letters
- Latin Extended-A (U+0100-U+017F) - Eastern European
- Latin Extended-B (U+0180-U+024F) - African languages, phonetic

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Latin alphabetic characters are found, False otherwise

Examples:

>>> has_latin("Pink Floyd")
True
>>> has_latin("Café")
True
>>> has_latin("МУР")  # noqa: RUF002
False
>>> has_latin("123")
False
>>> has_latin("!!!")
False
Source code in src/core/models/script_detection.py
def has_latin(text: str) -> bool:
    """Report whether ``text`` holds at least one Latin alphabetic character.

    A character counts only if ``str.isalpha()`` accepts it *and* it falls
    in one of these ranges:
    - Basic Latin (U+0041-U+005A, U+0061-U+007A) - A-Z, a-z
    - Latin-1 Supplement (U+0080-U+00FF) - Accented letters
    - Latin Extended-A (U+0100-U+017F) - Eastern European
    - Latin Extended-B (U+0180-U+024F) - African languages, phonetic

    Args:
        text: Text to analyze

    Returns:
        True if such a character is found, False otherwise (including for
        empty input, digits-only and punctuation-only text)

    Examples:
        >>> has_latin("Pink Floyd")
        True
        >>> has_latin("Café")
        True
        >>> has_latin("МУР")  # noqa: RUF002
        False
        >>> has_latin("123")
        False
        >>> has_latin("!!!")
        False

    """
    if not text:
        return False
    latin_ranges = (
        ("\u0041", "\u005a"),  # A-Z
        ("\u0061", "\u007a"),  # a-z
        ("\u0080", "\u00ff"),  # Latin-1 Supplement
        ("\u0100", "\u017f"),  # Latin Extended-A
        ("\u0180", "\u024f"),  # Latin Extended-B
    )
    for ch in text:
        # The alphabetic test runs first so symbols inside the ranges
        # (e.g. the multiplication sign in Latin-1) are rejected.
        if not ch.isalpha():
            continue
        for first, last in latin_ranges:
            if first <= ch <= last:
                return True
    return False

has_thai

has_thai(text)

Check if the text contains Thai characters.

Covers:

- Thai (U+0E00-U+0E7F) - Thai alphabet

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if Thai characters are found, False otherwise

Examples:

>>> has_thai("เพลงไทย")
True
>>> has_thai("Pink Floyd")
False
Source code in src/core/models/script_detection.py
def has_thai(text: str) -> bool:
    """Report whether ``text`` holds at least one Thai character.

    Block checked:
    - Thai (U+0E00-U+0E7F) - Thai alphabet

    Args:
        text: Text to analyze

    Returns:
        True if any character falls in the block, False otherwise
        (including for empty input)

    Examples:
        >>> has_thai("เพลงไทย")
        True
        >>> has_thai("Pink Floyd")
        False

    """
    if not text:
        return False
    thai_start, thai_end = "\u0e00", "\u0e7f"
    return any(thai_start <= ch <= thai_end for ch in text)

get_all_scripts

get_all_scripts(text)

Get all scripts detected in text.

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
list[ScriptType]

List of detected script types

Source code in src/core/models/script_detection.py
def get_all_scripts(text: str) -> list[ScriptType]:
    """Collect every script whose detector matches ``text``.

    Each entry of ``SCRIPT_DETECTORS`` is tried in the mapping's iteration
    order, and the result keeps that order.

    Args:
        text: Text to analyze

    Returns:
        Script types whose detector returned a truthy result; an empty list
        for empty input
    """
    detected: list[ScriptType] = []
    if text:
        for script_type, detector in SCRIPT_DETECTORS.items():
            if detector(text):
                detected.append(script_type)
    return detected

detect_primary_script

detect_primary_script(text)

Detect the primary script used in text.

Returns the dominant script type found in the text. When multiple scripts are present, dominance is determined by counting the characters that belong to each script.

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
ScriptType

Primary script type

Examples:

>>> detect_primary_script("МУР")  # noqa: RUF002
ScriptType.CYRILLIC
>>> detect_primary_script("Pink Floyd")
ScriptType.LATIN
>>> detect_primary_script("МУР featuring John")  # noqa: RUF002
ScriptType.MIXED
>>> detect_primary_script("音楽")
ScriptType.JAPANESE
Source code in src/core/models/script_detection.py
def detect_primary_script(text: str) -> ScriptType:
    """Determine which script dominates ``text``.

    Resolution order:
    1. Empty input yields ``ScriptType.UNKNOWN``.
    2. ``_handle_cjk_detection`` may settle the answer outright.
    3. Otherwise characters are tallied per script; no counted characters
       yields ``ScriptType.UNKNOWN``.
    4. ``_handle_latin_mixed_case`` may settle the Latin-plus-one-other case.
    5. Otherwise the script with the strictly highest count wins, and a tie
       for the highest count yields ``ScriptType.MIXED``.

    Args:
        text: Text to analyze

    Returns:
        Primary script type

    Examples:
        >>> detect_primary_script("МУР")  # noqa: RUF002
        ScriptType.CYRILLIC
        >>> detect_primary_script("Pink Floyd")
        ScriptType.LATIN
        >>> detect_primary_script("МУР featuring John")  # noqa: RUF002
        ScriptType.MIXED
        >>> detect_primary_script("音楽")
        ScriptType.JAPANESE

    """
    if not text:
        return ScriptType.UNKNOWN

    # CJK text gets first refusal; None means "not decided here".
    cjk_script = _handle_cjk_detection(text)
    if cjk_script is not None:
        return cjk_script

    # Tally characters per script.
    counts_by_script, counted_chars = _count_script_characters(text)
    if counted_chars == 0 or not counts_by_script:
        return ScriptType.UNKNOWN

    # Latin combined with exactly one other script is resolved separately.
    latin_mix_script = _handle_latin_mixed_case(counts_by_script, counted_chars)
    if latin_mix_script is not None:
        return latin_mix_script

    # A single leader wins; a tie at the top means no script dominates.
    top_count = max(counts_by_script.values())
    leaders = [script for script, count in counts_by_script.items() if count == top_count]
    if len(leaders) != 1:
        return ScriptType.MIXED
    return leaders[0]

is_script_type

is_script_type(text, script_type)

Check if text contains a specific script type.

Parameters:

Name Type Description Default
text str

Text to analyze

required
script_type ScriptType

Script type to check for

required

Returns:

Type Description
bool

True if the script type is detected in the text

Examples:

>>> is_script_type("МУР", ScriptType.CYRILLIC)  # noqa: RUF002
True
>>> is_script_type("Pink Floyd", ScriptType.LATIN)
True
>>> is_script_type("音楽", ScriptType.JAPANESE)
True
Source code in src/core/models/script_detection.py
def is_script_type(text: str, script_type: ScriptType) -> bool:
    """Report whether ``text`` contains characters of ``script_type``.

    The check is delegated to the detector registered for ``script_type``
    in ``SCRIPT_DETECTORS``.

    Args:
        text: Text to analyze
        script_type: Script type to check for

    Returns:
        The detector's result; False when ``text`` is empty or no detector
        is registered for ``script_type``

    Examples:
        >>> is_script_type("МУР", ScriptType.CYRILLIC)  # noqa: RUF002
        True
        >>> is_script_type("Pink Floyd", ScriptType.LATIN)
        True
        >>> is_script_type("音楽", ScriptType.JAPANESE)
        True

    """
    if text and script_type in SCRIPT_DETECTORS:
        return SCRIPT_DETECTORS[script_type](text)
    return False

is_primarily_cyrillic

is_primarily_cyrillic(text)

Check if text is primarily in Cyrillic script.

Legacy compatibility function for existing API prioritization logic.

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
bool

True if text is primarily Cyrillic (pure or mixed with Cyrillic dominance)

Examples:

>>> is_primarily_cyrillic("МУР")  # noqa: RUF002
True
>>> is_primarily_cyrillic("МУР feat. John")  # noqa: RUF002
True
>>> is_primarily_cyrillic("Pink Floyd")
False
Source code in src/core/models/script_detection.py
def is_primarily_cyrillic(text: str) -> bool:
    """Report whether ``text`` should be treated as Cyrillic.

    Legacy compatibility function for existing API prioritization logic.

    Args:
        text: Text to analyze

    Returns:
        True when ``detect_primary_script`` reports CYRILLIC or MIXED and
        ``has_cyrillic`` finds at least one Cyrillic character in ``text``;
        False otherwise

    Examples:
        >>> is_primarily_cyrillic("МУР")  # noqa: RUF002
        True
        >>> is_primarily_cyrillic("МУР feat. John")  # noqa: RUF002
        True
        >>> is_primarily_cyrillic("Pink Floyd")
        False

    """
    primary = detect_primary_script(text)
    if primary not in (ScriptType.CYRILLIC, ScriptType.MIXED):
        return False
    # MIXED says nothing about which scripts are involved, so confirm that
    # Cyrillic is actually present.
    return has_cyrillic(text)