# Source code for textacy.datasets.oxford_text_archive

"""
Oxford Text Archive literary works
----------------------------------

A collection of ~2.7k Creative Commons literary works from the Oxford Text Archive,
containing primarily English-language 16th-20th century literature and history.

Records include the following data:

    - ``text``: Full text of the literary work.
    - ``title``: Title of the literary work.
    - ``author``: Author(s) of the literary work.
    - ``year``: Year that the literary work was published.
    - ``url``: URL at which the literary work can be found online via the OTA.
    - ``id``: Unique identifier of the literary work within the OTA.

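For example, a single record has roughly the following shape (all field values
here are illustrative placeholders, not actual dataset contents)::

    {
        "text": "Full text of the literary work ...",
        "title": "Title of the literary work",
        "author": ("Lastname, Firstname",),
        "year": "1850",
        "url": "https://ota.ox.ac.uk/id/...",
        "id": "...",
    }
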
This dataset was compiled by David Mimno from the Oxford Text Archive and
stored in his GitHub repo to avoid unnecessary scraping of the OTA site. It is
downloaded from that repo and, aside from some light cleaning of its metadata,
is reproduced exactly here.
"""
import csv
import io
import itertools
import logging
import os
import pathlib
import re
from typing import Iterable, Optional, Set, Tuple, Union

from .. import constants, types, utils
from .. import io as tio
from .base import Dataset

LOGGER = logging.getLogger(__name__)

NAME = "oxford_text_archive"
META = {
    "site_url": "https://ota.ox.ac.uk/",
    "description": (
        "Collection of ~2.7k Creative Commons texts from the Oxford Text "
        "Archive, containing primarily English-language 16th-20th century "
        "literature and history."
    ),
}
DOWNLOAD_URL = "https://github.com/mimno/ota/archive/master.zip"
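# After downloading and unpacking the archive from DOWNLOAD_URL, the data is
# expected to live under ``data_dir/master/``: plain-text works in
# ``master/text/*.txt`` and their metadata in ``master/metadata.tsv``
# (see the paths set in ``OxfordTextArchive.__init__`` below).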


class OxfordTextArchive(Dataset):
    """
    Stream a collection of English-language literary works from text files on disk,
    either as texts or text + metadata pairs.

    Download the data (one time only!), saving and extracting its contents to disk::

        >>> import textacy.datasets
        >>> ds = textacy.datasets.OxfordTextArchive()
        >>> ds.download()
        >>> ds.info
        {'name': 'oxford_text_archive',
         'site_url': 'https://ota.ox.ac.uk/',
         'description': 'Collection of ~2.7k Creative Commons texts from the Oxford Text Archive, containing primarily English-language 16th-20th century literature and history.'}

    Iterate over literary works as texts or records with both text and metadata::

        >>> for text in ds.texts(limit=3):
        ...     print(text[:200])
        >>> for text, meta in ds.records(limit=3):
        ...     print("\\n{}, {}".format(meta["title"], meta["year"]))
        ...     print(text[:300])

    Filter literary works by a variety of metadata fields and text length::

        >>> for text, meta in ds.records(author="Shakespeare, William", limit=1):
        ...     print("{}\\n{}".format(meta["title"], text[:500]))
        >>> for text, meta in ds.records(date_range=("1900-01-01", "1990-01-01"), limit=5):
        ...     print(meta["year"], meta["author"])
        >>> for text in ds.texts(min_len=4000000):
        ...     print(len(text))

    Stream literary works into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::

        >>> textacy.Corpus("en", data=ds.records(limit=5))
        Corpus(5 docs; 182289 tokens)

    Args:
        data_dir (str or :class:`pathlib.Path`): Path to directory on disk
            under which dataset is stored,
            i.e. ``/path/to/data_dir/oxford_text_archive``.

    Attributes:
        full_date_range: First and last dates for which works are available,
            each as an ISO-formatted string (YYYY-MM-DD).
        authors (Set[str]): Full names of all distinct authors included in
            this dataset, e.g. "Shakespeare, William".
    """

    full_date_range: Tuple[str, str] = ("0018-01-01", "1990-01-01")

    def __init__(
        self,
        data_dir: Union[str, pathlib.Path] = constants.DEFAULT_DATA_DIR.joinpath(NAME),
    ):
        super().__init__(NAME, meta=META)
        self.data_dir = utils.to_path(data_dir).resolve()
        self._text_dirpath = self.data_dir.joinpath("master", "text")
        self._metadata_filepath = self.data_dir.joinpath("master", "metadata.tsv")
        self._metadata = None

    def download(self, *, force: bool = False) -> None:
        """
        Download the data as a zip archive file, then save it to disk and
        extract its contents under the :attr:`OxfordTextArchive.data_dir`
        directory.

        Args:
            force: If True, download the dataset, even if it already exists
                on disk under ``data_dir``.
        """
        filepath = tio.download_file(
            DOWNLOAD_URL,
            filename=None,
            dirpath=self.data_dir,
            force=force,
        )
        if filepath:
            tio.unpack_archive(filepath, extract_dir=None)

    @property
    def metadata(self):
        """Dict[str, dict]"""
        if not self._metadata:
            try:
                self._metadata = self._load_and_parse_metadata()
            except OSError as e:
                LOGGER.error(e)
        return self._metadata

    def _load_and_parse_metadata(self):
        """
        Read in the ``metadata.tsv`` file from
        :attr:`OxfordTextArchive._metadata_filepath`; convert it into a
        dictionary keyed by record ID; clean up some of the fields; and
        remove a couple of fields that are identical throughout.
        """
        if not self._metadata_filepath.is_file():
            raise OSError(
                f"metadata file {self._metadata_filepath} not found;\n"
                "has the dataset been downloaded yet?"
            )

        re_extract_year = re.compile(r"(\d{4})")
        # matches an author name (the non-digit run), optionally followed by
        # birth/death or floruit dates, e.g. "Shakespeare, William, 1564-1616"
        re_extract_authors = re.compile(
            r"(\D+)"
            r"(?:, "
            r"(?:[bdf]l?\. )?(?:ca. )?\d{4}(?:\?| or \d{1,2})?(?:-(?:[bdf]l?\. )?(?:ca. )?\d{4}(?:\?| or \d{1,2})?)?|"
            r"(?:\d{2}th(?:/\d{2}th)? cent\.)"
            r"\.?)"
        )
        # strips leading/trailing punctuation left over from author extraction
        re_clean_authors = re.compile(r"^[,;. ]+|[,.]+\s*?$")

        metadata = {}
        with self._metadata_filepath.open(mode="rb") as f:
            subf = io.StringIO(f.read().decode("utf-8"))
            for row in csv.DictReader(subf, delimiter="\t"):
                # only include English-language works (99.9% of all works)
                if not row["Language"].startswith("English"):
                    continue
                # clean up years
                year_match = re_extract_year.search(row["Year"])
                if year_match:
                    row["Year"] = year_match.group()
                else:
                    row["Year"] = None
                # extract and clean up authors
                authors = re_extract_authors.findall(row["Author"]) or [row["Author"]]
                row["Author"] = tuple(
                    re_clean_authors.sub("", author) for author in authors
                )
                row["Title"] = row["Title"].strip()
                # get rid of uniform "Language" and "License" fields
                del row["Language"]
                del row["License"]
                metadata[row["ID"]] = {key.lower(): val for key, val in row.items()}
        # set authors attribute for user convenience / to validate author filtering
        self.authors = {
            author
            for value in metadata.values()
            for author in value["author"]
            if value.get("author")
        }
        return metadata

    def __iter__(self):
        if not self._text_dirpath.is_dir():
            raise OSError(
                f"text directory {self._text_dirpath} not found;\n"
                "has the dataset been downloaded yet?"
            )
        _metadata = self.metadata  # for performance
        for filepath in sorted(tio.get_filepaths(self._text_dirpath, extension=".txt")):
            id_, _ = os.path.splitext(os.path.basename(filepath))
            record = _metadata.get(id_, {}).copy()
            if not record:
                LOGGER.debug(
                    "no metadata found for record %s; probably non-English text...",
                    id_,
                )
                continue
            with io.open(filepath, mode="rt", encoding="utf-8") as f:
                record["text"] = f.read()
            yield record

    def _get_filters(self, author, date_range, min_len):
        filters = []
        if min_len is not None:
            if min_len < 1:
                raise ValueError("`min_len` must be at least 1")
            filters.append(lambda record: len(record.get("text", "")) >= min_len)
        if author is not None:
            author = utils.validate_set_members(
                author, (str, bytes), valid_vals=self.authors
            )
            filters.append(
                lambda record: record.get("author")
                and any(athr in author for athr in record["author"])
            )
        if date_range is not None:
            date_range = utils.validate_and_clip_range(
                date_range, self.full_date_range, val_type=(str, bytes)
            )
            filters.append(
                lambda record: record.get("year")
                and date_range[0] <= record["year"] < date_range[1]
            )
        return filters

    def _filtered_iter(self, filters):
        if filters:
            for record in self:
                if all(filter_(record) for filter_ in filters):
                    yield record
        else:
            for record in self:
                yield record

    def texts(
        self,
        *,
        author: Optional[Union[str, Set[str]]] = None,
        date_range: Optional[Tuple[Optional[str], Optional[str]]] = None,
        min_len: Optional[int] = None,
        limit: Optional[int] = None,
    ) -> Iterable[str]:
        """
        Iterate over works in this dataset, optionally filtering by a variety
        of metadata and/or text length, and yield texts only.

        Args:
            author: Filter texts by the authors' names. For multiple values
                (Set[str]), ANY rather than ALL of the authors must be found
                among a given work's authors.
            date_range: Filter texts by the dates on which they were published;
                both start and end date must be specified, but a null value
                for either will be replaced by the min/max date available
                in the dataset.
            min_len: Filter texts by the length (# characters) of their text content.
            limit: Yield no more than ``limit`` texts that match all specified filters.

        Yields:
            Text of the next work in dataset passing all filters.

        Raises:
            ValueError: If any filtering options are invalid.
        """
        filters = self._get_filters(author, date_range, min_len)
        for record in itertools.islice(self._filtered_iter(filters), limit):
            yield record["text"]

    def records(
        self,
        *,
        author: Optional[Union[str, Set[str]]] = None,
        date_range: Optional[Tuple[Optional[str], Optional[str]]] = None,
        min_len: Optional[int] = None,
        limit: Optional[int] = None,
    ) -> Iterable[types.Record]:
        """
        Iterate over works in this dataset, optionally filtering by a variety
        of metadata and/or text length, and yield text + metadata pairs.

        Args:
            author: Filter texts by the authors' names. For multiple values
                (Set[str]), ANY rather than ALL of the authors must be found
                among a given work's authors.
            date_range: Filter texts by the dates on which they were published;
                both start and end date must be specified, but a null value
                for either will be replaced by the min/max date available
                in the dataset.
            min_len: Filter texts by the length (# characters) of their text content.
            limit: Yield no more than ``limit`` texts that match all specified filters.

        Yields:
            Text of the next work in dataset passing all filters, and its
            corresponding metadata.

        Raises:
            ValueError: If any filtering options are invalid.
        """
        filters = self._get_filters(author, date_range, min_len)
        for record in itertools.islice(self._filtered_iter(filters), limit):
            yield types.Record(text=record.pop("text"), meta=record)
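

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library module): this
# recombines the examples from the class docstring above, assuming the
# dataset archive can be downloaded to (or already exists under) the
# default data_dir.
if __name__ == "__main__":
    ds = OxfordTextArchive()
    ds.download()  # skipped if the archive already exists on disk
    # stream text + metadata pairs, filtered by author and publication date
    for text, meta in ds.records(
        author="Shakespeare, William",
        date_range=("1550-01-01", "1700-01-01"),
        limit=2,
    ):
        print(meta["title"], meta["year"])
        print(text[:200])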