Source code for textacy.datasets.udhr

"""
UDHR translations
-----------------

A collection of translations of the Universal Declaration of Human Rights (UDHR),
a milestone document in the history of human rights that first, formally established
fundamental human rights to be universally protected.

Records include the following fields:

    - ``text``: Full text of the translated UDHR document.
    - ``lang``: ISO-639-1 language code of the text.
    - ``lang_name``: Ethnologue entry for the language (see https://www.ethnologue.com).

The source dataset was compiled and is updated by the Unicode Consortium
as a way to demonstrate the use of Unicode in representing a wide variety of languages.
In fact, the UDHR was chosen because it's been translated into more languages
than any other document! However, this dataset only provides access to records
translated into ISO-639-1 languages — that is, major living languages *only*,
rather than every language, major or minor, that has ever existed. If you need access
to texts in those other languages, you can find them at :attr:`UDHR._texts_dirpath`.

For more details, go to https://unicode.org/udhr.
"""
import io
import itertools
import logging
import pathlib
import xml
from typing import Any, Dict, Iterable, List, Optional, Set, Union

from .. import constants, preprocessing, types, utils
from .. import io as tio
from .base import Dataset

# Module-level logger, named after this module's import path.
LOGGER = logging.getLogger(__name__)

# Dataset name: passed to the ``Dataset`` base class and used as the final
# component of the default ``data_dir``.
NAME = "udhr"
# Descriptive metadata passed to the ``Dataset`` base class as ``meta``.
META = {
    "site_url": "http://www.ohchr.org/EN/UDHR",
    "description": (
        "A collection of translations of the Universal Declaration of Human Rights (UDHR), "
        "a milestone document in the history of human rights that first, formally established "
        "fundamental human rights to be universally protected."
    ),
}
# URL of the zipped archive of translation text files fetched by ``UDHR.download()``.
DOWNLOAD_URL = "https://unicode.org/udhr/assemblies/udhr_txt.zip"


class UDHR(Dataset):
    """
    Stream a collection of UDHR translations from disk,
    either as texts or text + metadata pairs.

    Download the data (one time only!), saving and extracting its contents to disk::

        >>> import textacy.datasets
        >>> ds = textacy.datasets.UDHR()
        >>> ds.download()
        >>> ds.info
        {'name': 'udhr',
         'site_url': 'http://www.ohchr.org/EN/UDHR',
         'description': 'A collection of translations of the Universal Declaration of Human Rights (UDHR), a milestone document in the history of human rights that first, formally established fundamental human rights to be universally protected.'}

    Iterate over translations as texts or records with both text and metadata::

        >>> for text in ds.texts(limit=5):
        ...     print(text[:500])
        >>> for text, meta in ds.records(limit=5):
        ...     print("\\n{} ({})\\n{}".format(meta["lang_name"], meta["lang"], text[:500]))

    Filter translations by language, and note that some languages have multiple translations::

        >>> for text, meta in ds.records(lang="en"):
        ...     print("\\n{} ({})\\n{}".format(meta["lang_name"], meta["lang"], text[:500]))
        >>> for text, meta in ds.records(lang="zh"):
        ...     print("\\n{} ({})\\n{}".format(meta["lang_name"], meta["lang"], text[:500]))

    Note:
        Streaming translations into a :class:`textacy.Corpus <textacy.corpus.Corpus>`
        doesn't work as for other available datasets, since this dataset is multilingual.

    Args:
        data_dir (str or :class:`pathlib.Path`): Path to directory on disk
            under which the data is stored, i.e. ``/path/to/data_dir/udhr``.

    Attributes:
        langs (Set[str]): All distinct language codes with texts in this dataset,
            e.g. "en" for English. ``None`` until the index has been loaded
            (see :attr:`UDHR.index`).
    """

    def __init__(
        self,
        data_dir: Union[str, pathlib.Path] = constants.DEFAULT_DATA_DIR.joinpath(NAME),
    ):
        super().__init__(NAME, meta=META)
        self.data_dir = utils.to_path(data_dir).resolve()
        self._texts_dirpath = self.data_dir.joinpath("udhr_txt")
        self._index_filepath = self._texts_dirpath.joinpath("index.xml")
        # both are populated lazily, the first time the index is loaded
        self._index: Optional[List[Dict[str, Any]]] = None
        self.langs: Optional[Set[str]] = None

    def download(self, *, force: bool = False) -> None:
        """
        Download the data as a zipped archive of language-specific text files,
        then save it to disk and extract its contents under the ``data_dir`` directory.

        Args:
            force: If True, download the dataset, even if it already exists
                on disk under ``data_dir``.

        Raises:
            OSError: If, after downloading, the texts directory or its index file
                is not found on disk.
        """
        filepath = tio.download_file(
            DOWNLOAD_URL,
            filename="udhr_txt.zip",
            dirpath=self.data_dir,
            force=force,
        )
        if filepath:
            # extract into the same directory that ``_check_data()`` verifies
            tio.unpack_archive(filepath, extract_dir=self._texts_dirpath)
        self._check_data()

    def _check_data(self):
        """Check that necessary data is found on disk, or raise an OSError."""
        if not self._texts_dirpath.is_dir():
            raise OSError(
                f"data directory {self._texts_dirpath} not found; "
                "has the dataset been downloaded?"
            )
        if not self._index_filepath.is_file():
            raise OSError(
                f"data index file {self._index_filepath} not found; "
                "has the dataset been downloaded?"
            )

    @property
    def index(self) -> Optional[List[Dict[str, Any]]]:
        """
        Metadata for each available translation, loaded from disk on first access.

        Returns:
            List of dicts with "filename", "lang", and "lang_name" keys, or None
            if the index file could not be read (the error is logged, not raised).
        """
        # falsy check: a missing *or empty* index triggers another load attempt
        if not self._index:
            try:
                self._index = self._load_and_parse_index()
            except OSError as e:
                LOGGER.error(e)
        return self._index

    def _load_and_parse_index(self) -> List[Dict[str, Any]]:
        """
        Read in index xml file from :attr:`UDHR._index_filepath`; skip elements
        without valid ISO-639-1 language code or sufficient translation quality,
        then convert into a list of dicts with key metadata, including filenames.

        As a side effect, sets :attr:`UDHR.langs` to the set of language codes found.
        """
        # ``import xml`` alone does not load the ``xml.etree.ElementTree`` submodule,
        # so import it explicitly rather than rely on some other module having done so
        from xml.etree import ElementTree

        index = []
        tree = ElementTree.parse(self._index_filepath)
        root = tree.getroot()
        for ele in root.iterfind("udhr"):
            # keep only the primary language subtag, e.g. "zh" from "zh-Hans"
            iso_lang_code = ele.get("bcp47", "").split("-", 1)[0]
            # NOTE: raises if the "stage" attribute is missing or not an integer
            stage = int(ele.get("stage"))
            if len(iso_lang_code) != 2 or stage < 3:
                continue
            index.append(
                {
                    "filename": f"udhr_{ele.get('f')}.txt",
                    "lang": iso_lang_code,
                    "lang_name": ele.get("n"),
                }
            )
        # get set of all available langs, so users can filter on it
        self.langs = {item["lang"] for item in index}
        return index

    def _load_and_parse_text_file(self, filepath) -> str:
        """
        Read the text file at ``filepath``, drop everything up to and including
        the first ``---`` header separator line (if any), and normalize whitespace.
        """
        with open(filepath, mode="rt", encoding="utf-8") as f:
            text_lines = [line.strip() for line in f]
        # chop off the header, if it exists
        try:
            header_idx = text_lines.index("---")
        except ValueError:
            pass
        else:
            text_lines = text_lines[header_idx + 1 :]
        return preprocessing.normalize.whitespace("\n".join(text_lines))

    def _load_record(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of index entry ``item`` with its "text" loaded from disk."""
        record = item.copy()
        record["text"] = self._load_and_parse_text_file(
            self._texts_dirpath.joinpath(item["filename"])
        )
        return record

    def __iter__(self):
        """Yield one dict per indexed translation, with metadata fields plus "text"."""
        self._check_data()
        for item in self.index:
            yield self._load_record(item)

    def _filtered_iter(self, lang):
        """
        Yield records as :meth:`UDHR.__iter__` does, restricted to those whose
        "lang" is in ``lang``, if specified.
        """
        # this dataset is unusual in that the only filter we can really offer is lang
        # so we might as well avoid loading texts in unwanted languages
        if not lang:
            yield from self
            return
        self._check_data()
        # load the index *before* validating: parsing it is what populates
        # ``self.langs``, which would otherwise still be None on first use
        index = self.index
        lang = utils.validate_set_members(lang, str, valid_vals=self.langs)
        for item in index:
            if item["lang"] in lang:
                yield self._load_record(item)

    def texts(
        self,
        *,
        lang: Optional[Union[str, Set[str]]] = None,
        limit: Optional[int] = None,
    ) -> Iterable[str]:
        """
        Iterate over records in this dataset, optionally filtering by language,
        and yield texts only.

        Args:
            lang: Filter records by the language in which they're written;
                see :attr:`UDHR.langs`.
            limit: Yield no more than ``limit`` texts that match specified filter.

        Yields:
            Text of the next record in dataset passing filters.

        Raises:
            ValueError: If any filtering options are invalid.
        """
        for record in itertools.islice(self._filtered_iter(lang), limit):
            yield record["text"]

    def records(
        self,
        *,
        lang: Optional[Union[str, Set[str]]] = None,
        limit: Optional[int] = None,
    ) -> Iterable[types.Record]:
        """
        Iterate over records in this dataset, optionally filtering by language,
        and yield text + metadata pairs.

        Args:
            lang: Filter records by the language in which they're written;
                see :attr:`UDHR.langs`.
            limit: Yield no more than ``limit`` texts that match specified filter.

        Yields:
            Text of the next record in dataset passing filters,
            and its corresponding metadata.

        Raises:
            ValueError: If any filtering options are invalid.
        """
        for record in itertools.islice(self._filtered_iter(lang), limit):
            # each record is a fresh copy, so popping "text" leaves the index intact
            yield types.Record(text=record.pop("text"), meta=record)