# Source code for textacy.lang_id.lang_identifier

"""
Language Identification
-----------------------

:mod:`textacy.lang_id`: Interface for de/serializing a language identification model,
and using it to identify the most probable language(s) of a given text. Inspired by
Google's Compact Language Detector v3 (https://github.com/google/cld3) and
implemented with ``thinc`` v8.0.

Model
^^^^^

Character unigrams, bigrams, and trigrams are extracted separately from the first
1000 characters of lower-cased input text. Each collection of ngrams is hash-embedded
into a 100-dimensional space, then averaged. The resulting feature vectors are
concatenated into a single embedding layer, then passed on to a dense layer with
ReLU activation and finally a Softmax output layer. The model's predictions give
the probabilities for a text to be written in ~140 ISO 639-1 languages.

Dataset
^^^^^^^

The model was trained on a randomized, stratified subset of ~375k texts
drawn from several sources:

- **WiLi:** A public dataset of short text extracts from Wikipedias in over 230
  languages. Style is relatively formal; subject matter is "encyclopedic".
  Source: https://zenodo.org/record/841984
- **Tatoeba:** A crowd-sourced collection of sentences and their translations into
  many languages. Style is relatively informal; subject matter is a variety of
  everyday things and goings-on.
  Source: https://tatoeba.org/eng/downloads.
- **UDHR:** The UN's Universal Declaration of Human Rights document, translated into
  hundreds of languages and split into paragraphs. Style is formal; subject matter is
  fundamental human rights to be universally protected.
  Source: https://unicode.org/udhr/index.html
- **DSLCC**: Two collections of short excerpts of journalistic texts in a handful
  of language groups that are highly similar to each other. Style is relatively formal;
  subject matter is current events.
  Source: http://ttg.uni-saarland.de/resources/DSLCC/

Performance
^^^^^^^^^^^

The trained model achieved F1 = 0.97 when averaged over all languages.

A few languages have worse performance; for example, the two Norwegians ("nb" and "no"),
as well as Bosnian ("bs"), Serbian ("sr"), and Croatian ("hr"), which are extremely
similar to each other. See the textacy-data releases for more details:
https://github.com/bdewilde/textacy-data/releases/tag/lang-identifier-v2.0
"""
from __future__ import annotations

import logging
import pathlib
import urllib
import urllib.parse  # `import urllib` alone does not import the `parse` submodule
from typing import List, Tuple

from thinc.api import Model

from . import models
from .. import constants, utils


LOGGER = logging.getLogger(__name__)


class LangIdentifier:
    """
    Interface for de/serializing a language identification model and using it
    to identify the most probable language(s) of a given text.

    Args:
        version: Version of the model data to load/download, e.g. ``2.0``;
            coerced to ``str`` internally.
        data_dir: Directory on disk under which the model's binary file is stored.
        model_base: Framework ("shape") of the language identification model into
            which trained weights are loaded from disk. If None (default), a fresh
            :class:`models.LangIdentifierModelV2` is created per instance.

    Attributes:
        model: Trained :class:`thinc.api.Model`, lazily loaded from disk on first access.
        classes: Language class labels predicted by the model's output layer.
    """

    def __init__(
        self,
        version: float | str,
        data_dir: str | pathlib.Path = constants.DEFAULT_DATA_DIR.joinpath("lang_identifier"),
        model_base: Model | None = None,
    ):
        self.data_dir = utils.to_path(data_dir)
        self.version = str(version)
        # Instantiate the default model per instance rather than as a default
        # argument: a `models.LangIdentifierModelV2()` default would be evaluated
        # once at class-definition time and shared across all instances.
        self._model_base = (
            model_base if model_base is not None else models.LangIdentifierModelV2()
        )
        self._model = None
        self._classes = None

    @property
    def model_id(self) -> str:
        """Versioned model identifier, used for file names and release tags."""
        return f"lang-identifier-v{self.version}"

    @property
    def model_fpath(self) -> pathlib.Path:
        """Full path on disk to the model's saved binary file."""
        return self.data_dir.joinpath(f"{self.model_id}.bin")

    @property
    def model(self) -> Model:
        """Trained model; loaded from disk on first access, then cached."""
        if self._model is None:
            self._model = self.load_model()
        return self._model

    @property
    def classes(self):
        """Class (language) labels read off the model's output layer; cached."""
        if self._classes is None:
            self._classes = self.model.layers[-1].attrs["classes"]
        return self._classes

    def save_model(self):
        """Save trained :attr:`LangIdentifier.model` to disk, as bytes."""
        LOGGER.info("saving LangIdentifier model to %s", self.model_fpath)
        self.model.to_disk(self.model_fpath)

    def load_model(self) -> Model:
        """
        Load trained model from bytes on disk, using :attr:`LangIdentifier.model_base`
        as the framework into which the data is fit.

        Raises:
            FileNotFoundError: If no model data exists at :attr:`LangIdentifier.model_fpath`
                -- i.e. it hasn't been downloaded yet.
        """
        try:
            LOGGER.debug("loading LangIdentifier model from %s", self.model_fpath)
            return self._model_base.from_disk(self.model_fpath)
        except FileNotFoundError:
            LOGGER.exception(
                "LangIdentifier model not found at %s -- have you downloaded it yet?",
                self.model_fpath,
            )
            raise

    def download(self, force: bool = False):
        """
        Download version-specific model data as a binary file and save it to disk
        at :attr:`LangIdentifier.model_fpath`.

        Args:
            force: If True, download the model data, even if it already exists on disk
                under :attr:`self.data_dir`; otherwise, don't.
        """
        # hide this import, since we'll only ever need it _once_ (per model version)
        from .. import io as tio

        model_fname = self.model_fpath.name
        url = urllib.parse.urljoin(
            "https://github.com/bdewilde/textacy-data/releases/download/",
            self.model_id + "/" + model_fname,
        )
        tio.utils.download_file(
            url,
            filename=model_fname,
            dirpath=self.data_dir,
            force=force,
        )

    def identify_lang(
        self,
        text: str,
        with_probs: bool = False,
    ) -> str | Tuple[str, float]:
        """
        Identify the most probable language identified in ``text``,
        with or without the corresponding probability.

        Args:
            text: Text whose language is to be identified.
            with_probs: If True, return the language code together with its probability.

        Returns:
            ISO 639-1 standard language code of the most probable language,
            optionally with its probability.
        """
        if not self._is_valid_text(text):
            # "un" => undetermined language, with full confidence
            result = ("un", 1.0)
        else:
            text_ = utils.to_collection(text, str, list)
            result = models.get_topn_preds_and_probs(
                self.model.predict(text_), 1, self.classes
            )[0][0]
        return result[0] if with_probs is False else result

    def identify_topn_langs(
        self,
        text: str,
        topn: int = 3,
        with_probs: bool = False,
    ) -> List[str] | List[Tuple[str, float]]:
        """
        Identify the ``topn`` most probable languages identified in ``text``,
        with or without the corresponding probabilities.

        Args:
            text: Text whose language is to be identified.
            topn: Number of most probable languages to return.
            with_probs: If True, return each language code with its probability.

        Returns:
            ISO 639-1 standard language code, optionally with its probability,
            of the ``topn`` most probable languages.
        """
        if not self._is_valid_text(text):
            # "un" => undetermined language; returned alone regardless of ``topn``
            results = [("un", 1.0)]
        else:
            text_ = utils.to_collection(text, str, list)
            results = models.get_topn_preds_and_probs(
                self.model.predict(text_), topn, self.classes
            )[0]
        return [lang for lang, _ in results] if with_probs is False else results

    def _is_valid_text(self, text: str) -> bool:
        # the model can't say anything useful about text with no alphabetic chars
        return any(char.isalpha() for char in text)
# Module-level singleton: one shared identifier instance, configured for the
# current model version and the package's default data directory.
lang_identifier = LangIdentifier(
    version="2.0",
    data_dir=constants.DEFAULT_DATA_DIR.joinpath("lang_identifier"),
    model_base=models.LangIdentifierModelV2(),
)

# expose this as primary user-facing API
# TODO: there's gotta be a better way, this whole setup feels clunky
identify_lang = lang_identifier.identify_lang
identify_topn_langs = lang_identifier.identify_topn_langs