utils
Functions for preprocessing texts. Not language-specific.
cltk_normalize
Normalize text to Unicode NFC or NFKC, defaulting to the compatibility (NFKC) form.
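A minimal sketch of such normalization using Python's standard unicodedata module (an illustration of the described behavior, with the signature assumed from the description above, not cltk's actual implementation):

```python
import unicodedata


def cltk_normalize(text: str, compatibility: bool = True) -> str:
    """Return text in NFKC (compatibility) form by default, else NFC."""
    form = "NFKC" if compatibility else "NFC"
    return unicodedata.normalize(form, text)


# A base letter plus a combining accent is composed into one code point:
decomposed = "ε\u0301"  # epsilon + U+0301 COMBINING ACUTE ACCENT
assert len(cltk_normalize(decomposed)) == 1

# NFKC additionally folds compatibility characters, e.g. the "fi" ligature:
assert cltk_normalize("\ufb01") == "fi"
assert cltk_normalize("\ufb01", compatibility=False) == "\ufb01"
```

Normalization matters for polytonic Greek, where the same accented letter can be encoded either precomposed or as a base letter plus combining marks.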
remove_non_ascii
Remove non-ASCII characters.
Source: http://stackoverflow.com/a/1342373
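The linked answer filters characters by code point; a self-contained sketch of that approach (assumed to match the behavior described, not necessarily cltk's exact code):

```python
def remove_non_ascii(input_string: str) -> str:
    """Keep only characters whose code points fall in the ASCII range (< 128)."""
    return "".join(char for char in input_string if ord(char) < 128)


# Greek letters are dropped; ASCII letters, digits, and spaces survive:
# remove_non_ascii("οἶδα abc 123") -> " abc 123"
```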
remove_non_latin
Remove non-Latin characters.
also_keep should be a list of additional characters
(e.g., punctuation) that will not be filtered.
Source code in cltk/text/utils.py
split_trailing_punct
Split trailing punctuation from words.
Some tokenizers, including that in Stanza, do not always
handle punctuation properly. For example, a trailing colon ("οἶδα:")
is not split into an extra punctuation token. This function
does such splitting on raw text before being sent to such
a tokenizer.
Parameters:
- text (str): Input text string.
- punctuation (Optional[list[str]], default: None): List of punctuation that should be split when trailing a word.

Returns:
- str: Text string with trailing punctuation separated by a whitespace character.
Examples:
raw_text = "κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν"
split_trailing_punct(text=raw_text)
# 'κατηγόρων ’, οὐκ οἶδα : ἐγὼ δ᾽ οὖν'
split_leading_punct
Split leading punctuation from words.
Some tokenizers, including that in Stanza, do not always
handle punctuation properly. For example, an open curly
quote ("‘κατηγόρων’") is not split into an extra punctuation
token. This function does such splitting on raw text before
being sent to such a tokenizer.
Parameters:
- text (str): Input text string.
- punctuation (Optional[list[str]], default: None): List of punctuation that should be split when preceding a word.

Returns:
- str: Text string with leading punctuation separated by a whitespace character.
Examples:
raw_text = "‘κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν"
split_leading_punct(text=raw_text)
# '‘ κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν'
remove_odd_punct
Remove certain characters that downstream processes do not handle well.
It would be better to use split_leading_punct()
and split_trailing_punct(); however, the default models
shipped with Stanza make very strange mistakes when, e.g., "‘"
is made its own token.
What to do about the apostrophe following an elision (e.g.,
"δ᾽")?
Examples:
raw_text = "‘κατηγόρων’, οὐκ οἶδα: ἐγὼ δ᾽ οὖν"
remove_odd_punct(raw_text)
# 'κατηγόρων, οὐκ οἶδα ἐγὼ δ᾽ οὖν'
strip_section_numbers
Remove section numbers like '1.2.2', '[55]', '[55A]', '1.2.2A', '55', '55B', '2:4b' from the text.
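One way such stripping could work, with a single regular expression covering the listed patterns (a hypothetical sketch, not cltk's actual regex):

```python
import re

# Matches bare or bracketed section numbers: 1.2.2, [55], [55A], 1.2.2A, 55B, 2:4b
_SECTION_NUMBER = re.compile(r"\[?\b\d+(?:[.:]\d+)*[A-Za-z]?\]?")


def strip_section_numbers(text: str) -> str:
    """Remove section-number tokens, then collapse any doubled whitespace."""
    stripped = _SECTION_NUMBER.sub("", text)
    return re.sub(r"\s{2,}", " ", stripped).strip()
```

Note that a pattern this broad also removes bare numerals like "55" that may not be section numbers, which matches the examples given above.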