close
Skip to content

cltk

Lightweight package init exposing the public API.

Goals:

  • Avoid importing heavy dependencies or performing I/O at import time.
  • Re-export only stable, leaf-level names.
  • Define __all__ to the exact public surface.

NLP

NLP(
    language_code: Optional[str] = None,
    backend: BACKEND_TYPES = "stanza",
    model: Optional[
        Union[str, AVAILABLE_OPENAI_MODELS]
    ] = None,
    custom_pipeline: Optional[Pipeline] = None,
    suppress_banner: bool = False,
    cltk_config: Optional[CLTKConfig] = None,
)

Convenience facade for running CLTK pipelines.

Parameters:

  • language_code (Optional[str], default: None ) –

    Language key (Glottolog code, ISO code, or exact name). Required unless cltk_config is provided.

  • backend (BACKEND_TYPES, default: 'stanza' ) –

    One of "stanza" (default), "openai", "ollama", "ollama-cloud", or "mistral". The "spacy" backend is not yet implemented and will raise NotImplementedError.

  • model (Optional[Union[str, AVAILABLE_OPENAI_MODELS]], default: None ) –

    Optional model name when using generative backends ("openai", "ollama", "ollama-cloud", "mistral"). Ignored for "stanza".

  • custom_pipeline (Optional[Pipeline], default: None ) –

    Optional pipeline to use instead of the default mapping.

  • suppress_banner (bool, default: False ) –

    If true, suppresses informational console output.

  • cltk_config (Optional[CLTKConfig], default: None ) –

    Optional :class:`~cltk.core.data_types.CLTKConfig` bundle. When provided, its values override the other constructor arguments.

Notes
  • When backend == "openai" and no model is provided, defaults to "gpt-5-mini". Requires OPENAI_API_KEY in the environment.
  • When backend is "ollama" or "ollama-cloud" and no model is provided, defaults to "llama3.1:8b". "ollama-cloud" requires OLLAMA_CLOUD_API_KEY in the environment.
  • The "stanza" backend does not accept a model parameter; language models are bound to the pipeline for each language.
Source code in cltk/nlp.py
def __init__(
    self,
    language_code: Optional[str] = None,
    backend: BACKEND_TYPES = "stanza",
    model: Optional[Union[str, AVAILABLE_OPENAI_MODELS]] = None,
    custom_pipeline: Optional[Pipeline] = None,
    suppress_banner: bool = False,
    cltk_config: Optional["CLTKConfig"] = None,
) -> None:
    """Resolve language, backend, model, and pipeline for this NLP facade.

    Args:
      language_code: Language key used to look up the language/dialect.
        Required unless ``cltk_config`` supplies a language.
      backend: Processing backend; defaults to ``"stanza"``.
      model: Optional model name for generative backends. Must be ``None``
        when ``backend`` is ``"stanza"``.
      custom_pipeline: Pipeline to use instead of the default mapping.
      suppress_banner: If True, skip informational console output.
      cltk_config: Optional config bundle; when given, its values override
        the other constructor arguments.

    Raises:
      ValueError: If no language can be resolved, if a required API key for
        the chosen backend is missing, or if a ``model`` is passed together
        with the ``"stanza"`` backend.
      ImportError: Re-raised from ``ensure_stanza_available`` when the
        ``"stanza"`` backend is selected but stanza is unavailable.

    """
    self.cltk_config: Optional[CLTKConfig] = cltk_config
    # Backend-specific settings (api_key, model, ...) pulled from the bundle.
    backend_config: Optional[ModelConfig] = (
        cltk_config.active_backend_config if cltk_config else None
    )
    stanza_model_override: Optional[str] = None
    self.suppress_banner: bool = suppress_banner
    config_language: Optional[Language] = None
    if cltk_config:
        # Config values take precedence over the direct constructor args.
        language_code = cltk_config.language_code
        config_language = cltk_config.language
        backend = cltk_config.backend
        self.suppress_banner = cltk_config.suppress_banner
        if cltk_config.custom_pipeline is not None:
            custom_pipeline = cltk_config.custom_pipeline
        config_model_value = cltk_config.model
        if config_model_value is not None:
            model = config_model_value
        if model is None and backend_config is not None:
            model = getattr(backend_config, "model", None)
        if isinstance(backend_config, StanzaBackendConfig):
            # Carry the stanza package name separately so the "no model arg
            # for stanza" check below still passes.
            stanza_model_override = backend_config.model
            # Preserve stanza restriction against explicit model arg
            model = None

    if language_code is None and config_language is None:
        raise ValueError(
            "language_code is required when no CLTKConfig is provided."
        )
    if language_code is None and config_language is not None:
        language_code = config_language.glottolog_id

    bind_context(glottolog_id=language_code).info(
        f"Initializing NLP for language: {language_code}"
    )
    # self.language: Language = get_language(lang_id=language_code)
    self.language: Language
    self.dialect: Optional[Dialect]
    if config_language is not None:
        # Language object supplied directly by the config; skip the lookup.
        self.language = config_language
        self.dialect = None
    else:
        if language_code is None:
            # Unreachable given the guard above; kept for type narrowing.
            raise ValueError(
                "language_code is required when no CLTKConfig is provided."
            )
        self.language, self.dialect = get_language(lang_id=language_code)
    self.language_code: str
    resolved_code: Optional[str]
    # Prefer the dialect's glottolog id when a dialect was resolved.
    if self.dialect:
        resolved_code = self.dialect.glottolog_id
    else:
        resolved_code = self.language.glottolog_id
    if resolved_code is None:
        raise ValueError(
            "Resolved language is missing glottolog_id; cannot select a pipeline."
        )
    self.language_code = resolved_code
    self.backend: BACKEND_TYPES = backend
    self.model: Optional[str] = model
    self._backend_config: Optional[ModelConfig] = backend_config
    self._stanza_model_override: Optional[str] = stanza_model_override
    self._ollama_cloud_api_key: Optional[str] = None
    self.api_key: Optional[str] = None
    if self.backend == "openai":
        # Prefer API key from config when provided
        self.api_key = getattr(backend_config, "api_key", None)
        if not self.api_key:
            load_env_file()
            self.api_key = os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            openai_msg: str = "API key for OpenAI not found."
            logger.error(openai_msg)
            raise ValueError(openai_msg)
        # Default model if none provided
        self.model = self.model or getattr(backend_config, "model", None)
        self.model = self.model or "gpt-5-mini"
    elif self.backend in ("ollama", "ollama-cloud"):
        if self.backend == "ollama-cloud":
            # Prefer API key from config when provided
            self._ollama_cloud_api_key = getattr(backend_config, "api_key", None)
            if not self._ollama_cloud_api_key:
                load_env_file()
                self._ollama_cloud_api_key = os.getenv("OLLAMA_CLOUD_API_KEY")
            if not self._ollama_cloud_api_key:
                msg = "API key for Ollama Cloud not found."
                logger.error(msg)
                raise ValueError(msg)
        # Default model if none provided
        self.model = self.model or getattr(backend_config, "model", None)
        self.model = self.model or "llama3.1:8b"
    elif self.backend == "stanza":
        try:
            ensure_stanza_available()
        except ImportError as e:
            logger.error(str(e))
            raise
        # Stanza models are bound to language pipelines; reject explicit model
        if self.model is not None:
            raise ValueError(
                "The 'stanza' backend does not accept a model parameter; models are hardcoded per language."
            )
    elif self.backend == "mistral":
        # Prefer API key from config when provided
        self.api_key = getattr(backend_config, "api_key", None)
        if not self.api_key:
            load_env_file()
            self.api_key = os.getenv("MISTRAL_API_KEY")
        if not self.api_key:
            mistral_msg: str = "API key for Mistral not found."
            logger.error(mistral_msg)
            raise ValueError(mistral_msg)
        # Default model if none provided
        self.model = self.model or getattr(backend_config, "model", None)
        self.model = self.model or "mistral-medium-latest"
    # Custom pipeline wins over the language-derived default.
    self.pipeline: Pipeline = (
        custom_pipeline if custom_pipeline else self._get_pipeline()
    )
    # Ensure GenAI enrichment runs after dependency for generative backends.
    self._maybe_attach_enrichment_process()
    bind_context(
        glottolog_id=self.language_code,
        model=str(self.model) if getattr(self, "model", None) else None,
    ).debug(f"Pipeline selected: {self.pipeline}")
    if not self.suppress_banner:
        # Informational console output (CLTK info, pipeline, authorship notes).
        self._print_cltk_info()
        self._print_pipelines_for_current_lang()
        self._print_special_authorship_messages_for_current_lang()

cltk_config instance-attribute

cltk_config: Optional[CLTKConfig] = cltk_config

suppress_banner instance-attribute

suppress_banner: bool = suppress_banner

language instance-attribute

language: Language

dialect instance-attribute

dialect: Optional[Dialect]

language_code instance-attribute

language_code: str = resolved_code

backend instance-attribute

backend: BACKEND_TYPES = backend

model instance-attribute

model: Optional[str] = model

api_key instance-attribute

api_key: Optional[str] = None

pipeline instance-attribute

pipeline: Pipeline = (
    custom_pipeline if custom_pipeline else _get_pipeline()
)

analyze

analyze(text: str) -> Doc

Run text through the selected NLP pipeline and return a document.

Parameters:

  • text (str) –

    Raw text to analyze.

Returns:

  • Doc –

    A :class:`~cltk.core.data_types.Doc` enriched by each process in the
    pipeline (e.g., sentence boundaries, tokens, features).

Raises:

  • ValueError

    If text is empty or not a string.

  • RuntimeError

    If any process fails during execution.

Source code in cltk/nlp.py
def analyze(self, text: str) -> Doc:
    """Execute every process of the selected pipeline over ``text``.

    Args:
      text: Raw text to analyze.

    Returns:
      A :class:`~cltk.core.data_types.Doc` enriched by each process in the
      pipeline (e.g., sentence boundaries, tokens, features).

    Raises:
      ValueError: If ``text`` is empty or not a string.
      RuntimeError: If the pipeline has no processes or any process fails.

    """
    logger.info("Analyzing text with NLP pipeline.")
    if not isinstance(text, str) or not text:
        logger.error("Input text must be a non-empty string.")
        raise ValueError("Input text must be a non-empty string.")

    doc: Doc = Doc(language=self.language, raw=text)
    doc.backend = self.backend
    doc.model = getattr(self, "model", None)
    # Make backend settings visible to downstream processes
    # (LLM options, stanza package selection).
    if self._backend_config:
        doc.metadata["backend_config"] = self._backend_config
    if self._stanza_model_override:
        doc.metadata["stanza_package"] = self._stanza_model_override

    # Best-effort glottolog id for the provenance record; failures are benign.
    resolved_lang = None
    try:
        if doc.dialect and doc.dialect.glottolog_id:
            resolved_lang = doc.dialect.glottolog_id
        else:
            resolved_lang = doc.language.glottolog_id
    except Exception:
        resolved_lang = None

    snapshot = extract_doc_config(doc)
    record = build_provenance_record(
        language=resolved_lang,
        backend=doc.backend,
        process="NLP.analyze",
        model=str(doc.model) if doc.model else None,
        provider=str(doc.backend) if doc.backend else None,
        config=snapshot,
        notes={
            "pipeline": self.pipeline.__class__.__name__ if self.pipeline else None
        },
    )
    add_provenance_record(doc, record, set_default=True)
    log = bind_from_doc(doc)

    raw_steps = self.pipeline.processes
    steps = cast(list[Any], [] if raw_steps is None else raw_steps)
    if not steps:
        empty_msg: str = "No processes found in pipeline."
        log.error(empty_msg)
        raise RuntimeError(empty_msg)

    for step in steps:
        # Announce the process by name before executing it.
        step_name = (
            step.__name__ if isinstance(step, type) else step.__class__.__name__
        )
        if not self.suppress_banner:
            print(Fore.CYAN + f"⸖ Running {step_name} ..." + Style.RESET_ALL)
        runner: Process = self._get_process_object(process_object=step)
        try:
            log.debug(f"Running process: {runner.__class__.__name__}")
            doc = runner.run(doc)
        except Exception as e:
            log.error(f"Process '{runner.__class__.__name__}' failed: {e}")
            raise RuntimeError(
                f"Process '{runner.__class__.__name__}' failed: {e}"
            ) from e

    if doc.words is None or not isinstance(doc.words, list):
        log.warning(
            "Pipeline did not produce any words. Check your pipeline configuration and input text."
        )
    # Give each word a back-reference to its enclosing document.
    for word in doc.words or []:
        try:
            word._doc = doc
        except Exception:
            pass
    log.info("NLP analysis complete.")
    return doc

Submodules