"""
IMDB movie reviews
------------------

A collection of 50k highly polar movie reviews posted to IMDB, split evenly
into training and testing sets, with 25k positive and 25k negative sentiment labels,
as well as some unlabeled reviews.

Records include the following key fields (plus a few others):

    - ``text``: Full text of the review.
    - ``subset``: Subset of the dataset ("train" or "test") into which
      the review has been split.
    - ``label``: Sentiment label ("pos" or "neg") assigned to the review.
    - ``rating``: Numeric rating assigned by the original reviewer, ranging from
      1 to 10. Labeled reviews with a rating <= 4 are "neg" and those with a
      rating >= 7 are "pos"; reviews with more neutral ratings (5 or 6) are
      excluded from the labeled subsets.
    - ``movie_id``: Unique identifier for the movie under review within IMDB,
      useful for grouping reviews or joining with an external movie dataset.
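
An illustrative record, with hypothetical field values shown for shape only::

    {"text": "One of the best films I have seen in years ...",
     "subset": "train",
     "label": "pos",
     "rating": 9,
     "movie_id": "tt0110912"}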

Reference: Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng,
and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis.
The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
"""
import io
import itertools
import logging
import os
import re
from typing import Any, Dict, Iterable, Optional, Tuple

from .. import constants, types, utils
from .. import io as tio
from .base import Dataset

LOGGER = logging.getLogger(__name__)

NAME = "imdb"
META = {
    "site_url": "http://ai.stanford.edu/~amaas/data/sentiment",
    "description": (
        "Collection of 50k highly polar movie reviews split evenly "
        "into train and test sets, with 25k positive and 25k negative labels. "
        "Also includes some unlabeled reviews."
    ),
}
DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

RE_MOVIE_ID = re.compile(r"/(tt\d+)/")
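# Each urls_{label}.txt file in the dataset lists one IMDB review-page URL per
# line; the pattern above captures the "tt"-prefixed movie id between slashes,
# e.g. (illustrative) "http://www.imdb.com/title/tt0064354/usercomments" -> "tt0064354"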


class IMDB(Dataset):
    """
    Stream a collection of IMDB movie reviews from text files on disk,
    either as texts or text + metadata pairs.

    Download the data (one time only!), saving and extracting its contents to disk::

        >>> import textacy.datasets
        >>> ds = textacy.datasets.IMDB()
        >>> ds.download()
        >>> ds.info
        {'name': 'imdb',
         'site_url': 'http://ai.stanford.edu/~amaas/data/sentiment',
         'description': 'Collection of 50k highly polar movie reviews split evenly into train and test sets, with 25k positive and 25k negative labels. Also includes some unlabeled reviews.'}

    Iterate over movie reviews as texts or records with both text and metadata::

        >>> for text in ds.texts(limit=5):
        ...     print(text)
        >>> for text, meta in ds.records(limit=5):
        ...     print("\\n{} {}\\n{}".format(meta["label"], meta["rating"], text))

    Filter movie reviews by a variety of metadata fields and text length::

        >>> for text, meta in ds.records(label="pos", limit=5):
        ...     print(meta["rating"], ":", text)
        >>> for text, meta in ds.records(rating_range=(9, 11), limit=5):
        ...     print(meta["rating"], text)
        >>> for text in ds.texts(min_len=1000, limit=5):
        ...     print(len(text))

    Stream movie reviews into a :class:`textacy.Corpus <textacy.corpus.Corpus>`::

        >>> textacy.Corpus("en", data=ds.records(limit=100))
        Corpus(100 docs; 24340 tokens)

    Args:
        data_dir: Path to directory on disk under which the data is stored,
            i.e. ``/path/to/data_dir/imdb``.

    Attributes:
        full_rating_range: Lowest (inclusive) and highest (exclusive) ratings
            for which movie reviews are available.
    """

    # Ratings run 1-10; the upper bound here is exclusive, matching the
    # [low, high) interval used by ``rating_range`` filtering below.
    full_rating_range: Tuple[int, int] = (1, 11)

    def __init__(
        self,
        data_dir=constants.DEFAULT_DATA_DIR.joinpath(NAME),
    ):
        super().__init__(NAME, meta=META)
        self.data_dir = utils.to_path(data_dir).resolve()
        self._movie_ids = {"train": {}, "test": {}}
        self._subset_labels = {
            "train": ("pos", "neg", "unsup"),
            "test": ("pos", "neg"),
        }
        self._subset = None
        self._label = None
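
    # Expected on-disk layout after download and extraction, per the checks
    # and filename parsing below:
    #     aclImdb/{train,test}/{pos,neg[,unsup]}/{id}_{rating}.txt
    #     aclImdb/{train,test}/urls_{label}.txt  (line number -> review-page URL)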

    def download(self, *, force: bool = False) -> None:
        """
        Download the data as a compressed tar archive file, then save it to disk
        and extract its contents under the ``data_dir`` directory.

        Args:
            force: If True, always download the dataset even if it already exists
                on disk under ``data_dir``.
        """
        filepath = tio.download_file(
            DOWNLOAD_URL,
            filename="aclImdb.tar.gz",
            dirpath=self.data_dir,
            force=force,
        )
        if filepath:
            tio.unpack_archive(filepath, extract_dir=None)
        self._check_data()

    def _check_data(self):
        """Check that necessary data is found on disk, or raise an OSError."""
        data_dirpaths = (
            self.data_dir.joinpath("aclImdb", subset, label)
            for subset, labels in self._subset_labels.items()
            for label in labels
        )
        url_filepaths = (
            self.data_dir.joinpath("aclImdb", subset, f"urls_{label}.txt")
            for subset, labels in self._subset_labels.items()
            for label in labels
        )
        for dirpath in data_dirpaths:
            if not dirpath.is_dir():
                raise OSError(
                    f"data directory {dirpath} not found; "
                    "has the dataset been downloaded?"
                )
        for filepath in url_filepaths:
            if not filepath.is_file():
                raise OSError(
                    f"data file {filepath} not found; has the dataset been downloaded?"
                )

    def __iter__(self):
        self._check_data()
        dirpaths = tuple(
            self.data_dir.joinpath("aclImdb", subset, label)
            for subset in self._subset or self._subset_labels.keys()
            for label in self._label or self._subset_labels[subset]
        )
        for dirpath in dirpaths:
            for filepath in tio.get_filepaths(dirpath, match_regex=r"^\d+_\d+\.txt$"):
                yield self._load_record(filepath)

    def _load_record(self, filepath: str) -> Dict[str, Any]:
        dirpath, filename = os.path.split(filepath)
        dirpath, label = os.path.split(dirpath)
        _, subset = os.path.split(dirpath)
        id_, rating = filename[:-4].split("_")
        with io.open(filepath, mode="rt", encoding="utf-8") as f:
            text = f.read().replace("<br />", "\n").strip()
        return {
            "text": text,
            "subset": subset,
            "label": label,
            "rating": int(rating) if label != "unsup" else None,
            "movie_id": self._get_movie_id(subset, label, int(id_)),
        }

    def _get_movie_id(self, subset, label, id_):
        try:
            return self._movie_ids[subset][label][id_]
        except KeyError:
            fpath = self.data_dir.joinpath("aclImdb", subset, f"urls_{label}.txt")
            self._movie_ids[subset][label] = {
                id_: RE_MOVIE_ID.search(line).group(1)
                for id_, line in enumerate(tio.read_text(fpath, mode="rt", lines=True))
            }
            return self._movie_ids[subset][label][id_]

    def _get_filters(self, rating_range, min_len):
        filters = []
        if min_len is not None:
            if min_len < 1:
                raise ValueError("`min_len` must be at least 1")
            filters.append(lambda record: len(record.get("text", "")) >= min_len)
        if rating_range is not None:
            rating_range = utils.validate_and_clip_range(
                rating_range, self.full_rating_range, val_type=int
            )
            filters.append(
                lambda record: (
                    record.get("rating")
                    and rating_range[0] <= record["rating"] < rating_range[1]
                )
            )
        return filters

    def _filtered_iter(self, filters):
        if filters:
            for record in self:
                if all(filter_(record) for filter_ in filters):
                    yield record
        else:
            for record in self:
                yield record

    def texts(
        self,
        *,
        subset: Optional[str] = None,
        label: Optional[str] = None,
        rating_range: Optional[Tuple[Optional[int], Optional[int]]] = None,
        min_len: Optional[int] = None,
        limit: Optional[int] = None,
    ) -> Iterable[str]:
        """
        Iterate over movie reviews in this dataset, optionally filtering by
        a variety of metadata and/or text length, and yield texts only.

        Args:
            subset ({"train", "test"}): Filter movie reviews by the dataset subset
                into which they've already been split.
            label ({"pos", "neg", "unsup"}): Filter movie reviews by the assigned
                sentiment label (or lack thereof, for "unsup").
            rating_range: Filter movie reviews by the rating assigned by the reviewer.
                Only those with ratings in the interval [low, high) are included.
                Both low and high values must be specified, but a null value for
                either is automatically replaced by the minimum or maximum
                valid value, respectively.
            min_len: Filter reviews by the length (# characters) of their text content.
            limit: Yield no more than ``limit`` reviews that match all specified filters.

        Yields:
            Text of the next movie review in the dataset passing all filters.

        Raises:
            ValueError: If any filtering options are invalid.
        """
        self._subset = utils.to_collection(subset, (str, bytes), tuple)
        self._label = utils.to_collection(label, (str, bytes), tuple)
        try:
            filters = self._get_filters(rating_range, min_len)
            for record in itertools.islice(self._filtered_iter(filters), limit):
                yield record["text"]
        finally:
            self._subset = None
            self._label = None

    def records(
        self,
        *,
        subset: Optional[str] = None,
        label: Optional[str] = None,
        rating_range: Optional[Tuple[Optional[int], Optional[int]]] = None,
        min_len: Optional[int] = None,
        limit: Optional[int] = None,
    ) -> Iterable[types.Record]:
        """
        Iterate over movie reviews in this dataset, optionally filtering by
        a variety of metadata and/or text length, and yield text + metadata pairs.

        Args:
            subset ({"train", "test"}): Filter movie reviews by the dataset subset
                into which they've already been split.
            label ({"pos", "neg", "unsup"}): Filter movie reviews by the assigned
                sentiment label (or lack thereof, for "unsup").
            rating_range: Filter movie reviews by the rating assigned by the reviewer.
                Only those with ratings in the interval [low, high) are included.
                Both low and high values must be specified, but a null value for
                either is automatically replaced by the minimum or maximum
                valid value, respectively.
            min_len: Filter reviews by the length (# characters) of their text content.
            limit: Yield no more than ``limit`` reviews that match all specified filters.

        Yields:
            Text of the next movie review in the dataset passing all filters,
            and its corresponding metadata.

        Raises:
            ValueError: If any filtering options are invalid.
        """
        self._subset = utils.to_collection(subset, (str, bytes), tuple)
        self._label = utils.to_collection(label, (str, bytes), tuple)
        try:
            filters = self._get_filters(rating_range, min_len)
            for record in itertools.islice(self._filtered_iter(filters), limit):
                yield types.Record(text=record.pop("text"), meta=record)
        finally:
            self._subset = None
            self._label = None
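
# A minimal usage sketch (not part of the library API; assumes the data has
# already been downloaded, and output values are illustrative). Unlabeled
# reviews live in the "train" subset under label "unsup", with rating None:
#
#     ds = IMDB()
#     for text, meta in ds.records(subset="train", label="unsup", limit=2):
#         print(meta["movie_id"], meta["rating"])  # e.g. "tt0117056" None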