# Source code for textacy.resources.concept_net

"""
ConceptNet
----------

ConceptNet is a multilingual knowledge base, representing common words and phrases
and the common-sense relationships between them. This information is collected from
a variety of sources, including crowd-sourced resources (e.g. Wiktionary, Open Mind
Common Sense), games with a purpose (e.g. Verbosity, nadya.jp), and expert-created
resources (e.g. WordNet, JMDict).

The interface in textacy gives access to several key relationships between terms
that are useful in a variety of NLP tasks:

    - antonyms: terms that are opposites of each other in some relevant way
    - hyponyms: terms that are subtypes or specific instances of other terms
    - meronyms: terms that are parts of other terms
    - synonyms: terms that are sufficiently similar that they may be used interchangeably
"""
import collections
import logging

from spacy.tokens import Span, Token
from tqdm import tqdm

from .. import constants
from .. import io as tio
from .. import utils
from .base import Resource


# Logger named after this module's import path.
LOGGER = logging.getLogger(__name__)

# Identifier of this resource: passed to the ``Resource`` base class and used as
# the name of the sub-directory under the default data directory.
NAME = "concept_net"

# Descriptive metadata handed to the ``Resource`` base class.
META = dict(
    site_url="http://conceptnet.io",
    publication_url="https://arxiv.org/abs/1612.03975",
    description=(
        "An open, multilingual semantic network of general knowledge, "
        "designed to help computers understand the meanings of words."
    ),
)

# URL template for the gzipped assertions file; filled in with the db ``version``
# and that version's release ``year``.
DOWNLOAD_ROOT = (
    "https://s3.amazonaws.com/conceptnet/downloads/{year}/edges/"
    "conceptnet-assertions-{version}.csv.gz"
)


class ConceptNet(Resource):
    """
    Interface to ConceptNet, a multilingual knowledge base representing common words
    and phrases and the common-sense relationships between them.

    Download the data (one time only!), and save its contents to disk::

        >>> import textacy.resources
        >>> rs = textacy.resources.ConceptNet()
        >>> rs.download()
        >>> rs.info
        {'name': 'concept_net',
         'site_url': 'http://conceptnet.io',
         'publication_url': 'https://arxiv.org/abs/1612.03975',
         'description': 'An open, multilingual semantic network of general knowledge, designed to help computers understand the meanings of words.'}

    Access other same-language terms related to a given term in a variety of ways::

        >>> rs.get_synonyms("spouse", lang="en", sense="n")
        ['mate', 'married person', 'better half', 'partner']
        >>> rs.get_antonyms("love", lang="en", sense="v")
        ['detest', 'hate', 'loathe']
        >>> rs.get_hyponyms("marriage", lang="en", sense="n")
        ['cohabitation situation', 'union', 'legal agreement', 'ritual', 'family', 'marital status']

    **Note:** The very first time a given relationship is accessed, the full ConceptNet db
    must be parsed and split for fast future access. This can take a couple minutes;
    be patient.

    When passing a spaCy ``Token`` or ``Span``, the corresponding ``lang`` and ``sense``
    are inferred automatically from the object::

        >>> text = "The quick brown fox jumps over the lazy dog."
        >>> doc = textacy.make_spacy_doc(text, lang="en")
        >>> rs.get_synonyms(doc[1])  # quick
        ['flying', 'fast', 'rapid', 'ready', 'straightaway', 'nimble', 'speedy', 'warm']
        >>> rs.get_synonyms(doc[4:5])  # jumps (a one-token span)
        ['leap', 'startle', 'hump', 'flinch', 'jump off', 'skydive', 'jumpstart', ...]

    Many terms won't have entries, for actual linguistic reasons or because the db's
    coverage of a given language's vocabulary isn't comprehensive::

        >>> rs.get_meronyms(doc[3])  # fox
        []
        >>> rs.get_antonyms(doc[7])  # lazy
        []

    Args:
        data_dir (str or :class:`pathlib.Path`): Path to directory on disk
            under which resource data is stored, i.e. ``/path/to/data_dir/concept_net``.
        version ({"5.7.0", "5.6.0", "5.5.5"}): Version string of the ConceptNet db
            to use. Since newer versions typically represent improvements over earlier
            versions, you'll probably want "5.7.0" (the default value).
    """

    # db version => year used in that version's download URL
    _version_years = {"5.7.0": 2019, "5.6.0": 2018, "5.5.5": 2017}
    # part-of-speech string (as found on ``Token.pos_``) => ConceptNet sense label
    _pos_map = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}

    def __init__(
        self, data_dir=constants.DEFAULT_DATA_DIR.joinpath(NAME), version="5.7.0",
    ):
        super().__init__(NAME, meta=META)
        self.version = version
        # each db version gets its own sub-directory under ``data_dir``
        self.data_dir = utils.to_path(data_dir).resolve().joinpath(self.version)
        self._filename = "conceptnet-assertions-{}.csv.gz".format(self.version)
        self._filepath = self.data_dir.joinpath(self._filename)
        # per-relation data; loaded lazily by the same-named public properties
        self._antonyms = None
        self._hyponyms = None
        self._meronyms = None
        self._synonyms = None

    def download(self, *, force=False):
        """
        Download resource data as a gzipped csv file,
        then save it to disk under the :attr:`ConceptNet.data_dir` directory.

        Args:
            force (bool): If True, download resource data, even if it already
                exists on disk; otherwise, don't re-download the data.

        Raises:
            KeyError: If :attr:`ConceptNet.version` isn't one of the known versions,
                since no download URL can be built for it.
        """
        url = DOWNLOAD_ROOT.format(
            version=self.version, year=self._version_years[self.version]
        )
        tio.download_file(
            url, filename=self._filename, dirpath=self.data_dir, force=force,
        )

    @property
    def filepath(self):
        """
        str: Full path on disk for the ConceptNet gzipped csv file
        corresponding to the given :attr:`ConceptNet.data_dir`,
        or None if no such file exists.
        """
        return str(self._filepath) if self._filepath.is_file() else None

    @staticmethod
    def _normalize_term(text):
        """
        Put ``text`` into the form used for term keys in relation data: lower-cased,
        with words separated by spaces rather than underscores. This matches
        the underscore => space conversion applied when concept URIs are parsed.
        """
        return text.replace("_", " ").lower()

    def _get_relation_data(self, relation, is_symmetric=False):
        """
        Get all same-language term pairs linked by ``relation``, loading them from
        a per-relation json file if one has already been saved under
        :attr:`ConceptNet.data_dir`; otherwise, by parsing the full assertions csv file
        and saving the result to disk for next time.

        Args:
            relation (str): ConceptNet relation URI, e.g. "/r/Antonym".
            is_symmetric (bool): If True, each "start => end" assertion is also
                recorded in the reverse direction, "end => start".

        Returns:
            Dict[str, Dict[str, Dict[str, List[str]]]]: Mapping of language code
            to term to sense to list of related terms.

        Raises:
            OSError: If the assertions csv file isn't found on disk.
        """
        if not self.filepath:
            raise OSError(
                "resource file {} not found;\n"
                "has the data been downloaded yet?".format(self._filepath)
            )
        rel_fname = "{}.json.gz".format(_split_uri(relation)[1].lower())
        rel_fpath = self.data_dir.joinpath(rel_fname)
        if rel_fpath.is_file():
            LOGGER.debug("loading data for '%s' relation from %s", relation, rel_fpath)
            return next(
                tio.read_json(rel_fpath, mode="rt", encoding="utf-8", lines=False)
            )

        collected = collections.defaultdict(
            lambda: collections.defaultdict(lambda: collections.defaultdict(set))
        )
        LOGGER.info(
            "preparing data for '%s' relation; this may take a while...", relation
        )
        # NOTE(review): quoting=1 equals csv.QUOTE_ALL; presumably it's forwarded
        # to the underlying csv reader -- confirm against ``tio.read_csv``
        rows = tio.read_csv(self.filepath, delimiter="\t", quoting=1)
        with tqdm() as pbar:
            for row in rows:
                pbar.update(1)
                _, rel_type, start_uri, end_uri, _ = row
                # skip ahead to the rows for this relation, then stop right after them;
                # this only works if rows are ordered by relation type, which
                # the assertions file is presumed to be
                if rel_type < relation:
                    continue
                elif rel_type > relation:
                    break
                start_lang, start_term, start_sense = _parse_concept_uri(start_uri)
                end_lang, end_term, end_sense = _parse_concept_uri(end_uri)
                if start_lang == end_lang and start_term != end_term:
                    collected[start_lang][start_term][start_sense].add(end_term)
                    if is_symmetric:
                        collected[start_lang][end_term][end_sense].add(start_term)
        # make relation data json-able (i.e. cast set => list) and, importantly,
        # convert the defaultdicts into plain dicts: a defaultdict would silently
        # insert an empty entry on every failed lookup, so freshly-parsed data would
        # behave differently from the same data loaded back from disk
        rel_data = {
            lang: {
                term: {sense: list(rel_terms) for sense, rel_terms in senses.items()}
                for term, senses in terms.items()
            }
            for lang, terms in collected.items()
        }
        LOGGER.info("saving data for '%s' relation to %s", relation, rel_fpath)
        tio.write_json(rel_data, rel_fpath, mode="wt", encoding="utf-8")
        return rel_data

    def _get_relation_values(self, rel_data, term, lang=None, sense=None):
        """
        Look up the terms related to ``term`` in ``rel_data``.

        Args:
            rel_data (Dict[str, Dict[str, Dict[str, List[str]]]]): Mapping of
                language code to term to sense to list of related terms.
            term (str or :class:`spacy.tokens.Token` or :class:`spacy.tokens.Span`):
                If a ``Token`` or ``Span``, its text is looked up first, then its lemma.
                Words of multi-word terms may be separated by spaces or underscores.
            lang (str): Language code of ``term``. Required if ``term`` is a string;
                otherwise, inferred from ``term`` if not specified.
            sense (str): Sense of ``term``, either a ConceptNet label ("n", "v", "a", "r")
                or the equivalent POS string ("NOUN", "VERB", "ADJ", "ADV").
                Required if ``term`` is a string; otherwise, inferred from the part
                of speech of ``term`` (or of its first token, for a ``Span``)
                if not specified.

        Returns:
            List[str]: Related terms; empty if there are none, or if ``sense``
            is (or is inferred to be) a value without a ConceptNet equivalent.

        Raises:
            ValueError: If ``lang`` is specified but isn't in ``rel_data``, or
                if ``term`` is a string but ``lang`` or ``sense`` is missing.
            TypeError: If ``term`` isn't a string, ``Token``, or ``Span``.
        """
        if lang is not None and lang not in rel_data:
            raise ValueError(
                "lang='{}' is invalid; valid langs are {}".format(
                    lang, sorted(rel_data.keys())
                )
            )
        if sense is not None:
            # let's be kind and automatically convert standard POS strings
            if sense in self._pos_map:
                sense = self._pos_map[sense]
            # otherwise, return no results, as is done when auto-inference of sense
            # results in an invalid value
            elif sense not in self._pos_map.values():
                return []
        if isinstance(term, str):
            if not (lang and sense):
                raise ValueError(
                    "if `term` is a string, both `lang` and `sense` must be specified"
                )
            norm_terms = [self._normalize_term(term)]
        elif isinstance(term, (Span, Token)):
            norm_terms = [
                self._normalize_term(term.text),
                self._normalize_term(term.lemma_),
            ]
            if not lang:
                try:
                    lang = term.lang_  # token
                except AttributeError:
                    lang = term[0].lang_  # span
            if not sense:
                try:
                    pos = term.pos_  # token
                except AttributeError:
                    pos = term[0].pos_  # span
                # map the POS outside of the try/except, so that an unmapped POS
                # yields no results for tokens and spans alike
                sense = self._pos_map.get(pos)
                if sense is None:
                    return []
        else:
            raise TypeError(
                "`term` must be one of {}, not {}".format({str, Span, Token}, type(term))
            )
        # TODO: implement an out-of-vocabulary strategy? for example,
        # https://github.com/commonsense/conceptnet-numberbatch#out-of-vocabulary-strategy
        # use .get() throughout, so that lookups never add entries to ``rel_data``
        lang_terms = rel_data.get(lang)
        if lang_terms is None:
            return []
        for norm_term in norm_terms:
            senses = lang_terms.get(norm_term)
            if senses is not None:
                return senses.get(sense, [])
        return []

    @property
    def antonyms(self):
        """
        Dict[str, Dict[str, Dict[str, List[str]]]]: Mapping of language code to term to
        sense to list of term's antonyms -- opposites of the term in some relevant way,
        like being at opposite ends of a scale or fundamentally similar but with
        a key difference between them -- such as black <=> white or hot <=> cold. Note
        that this relationship is symmetric.

        Based on the "/r/Antonym" relation in ConceptNet.
        """
        # compare against None, so that empty data isn't re-loaded on every access
        if self._antonyms is None:
            self._antonyms = self._get_relation_data("/r/Antonym", is_symmetric=True)
        return self._antonyms

    def get_antonyms(self, term, *, lang=None, sense=None):
        """
        Args:
            term (str or :class:`spacy.tokens.Token` or :class:`spacy.tokens.Span`)
            lang (str): Standard code for the language of ``term``.
            sense (str): Sense in which ``term`` is used in context, which in practice
                is just its part of speech. Valid values: "n" or "NOUN", "v" or "VERB",
                "a" or "ADJ", "r" or "ADV".

        Returns:
            List[str]
        """
        return self._get_relation_values(self.antonyms, term, lang=lang, sense=sense)

    @property
    def hyponyms(self):
        """
        Dict[str, Dict[str, Dict[str, List[str]]]]: Mapping of language code to term to
        sense to list of term's hyponyms -- subtypes or specific instances of the term --
        such as car => vehicle or Chicago => city. Every A is a B.

        Based on the "/r/IsA" relation in ConceptNet. Each term is mapped to
        the end terms of its "IsA" assertions, so for "car IsA vehicle",
        it's "car" that maps to "vehicle".
        """
        # compare against None, so that empty data isn't re-loaded on every access
        if self._hyponyms is None:
            self._hyponyms = self._get_relation_data("/r/IsA", is_symmetric=False)
        return self._hyponyms

    def get_hyponyms(self, term, *, lang=None, sense=None):
        """
        Args:
            term (str or :class:`spacy.tokens.Token` or :class:`spacy.tokens.Span`)
            lang (str): Standard code for the language of ``term``.
            sense (str): Sense in which ``term`` is used in context, which in practice
                is just its part of speech. Valid values: "n" or "NOUN", "v" or "VERB",
                "a" or "ADJ", "r" or "ADV".

        Returns:
            List[str]
        """
        return self._get_relation_values(self.hyponyms, term, lang=lang, sense=sense)

    @property
    def meronyms(self):
        """
        Dict[str, Dict[str, Dict[str, List[str]]]]: Mapping of language code to term to
        sense to list of term's meronyms -- parts of the term -- such as gearshift => car.

        Based on the "/r/PartOf" relation in ConceptNet. Each term is mapped to
        the end terms of its "PartOf" assertions, so for "gearshift PartOf car",
        it's "gearshift" that maps to "car".
        """
        # compare against None, so that empty data isn't re-loaded on every access
        if self._meronyms is None:
            self._meronyms = self._get_relation_data("/r/PartOf", is_symmetric=False)
        return self._meronyms

    def get_meronyms(self, term, *, lang=None, sense=None):
        """
        Args:
            term (str or :class:`spacy.tokens.Token` or :class:`spacy.tokens.Span`)
            lang (str): Standard code for the language of ``term``.
            sense (str): Sense in which ``term`` is used in context, which in practice
                is just its part of speech. Valid values: "n" or "NOUN", "v" or "VERB",
                "a" or "ADJ", "r" or "ADV".

        Returns:
            List[str]
        """
        return self._get_relation_values(self.meronyms, term, lang=lang, sense=sense)

    @property
    def synonyms(self):
        """
        Dict[str, Dict[str, Dict[str, List[str]]]]: Mapping of language code to term to
        sense to list of term's synonyms -- sufficiently similar concepts that they may
        be used interchangeably -- such as sunlight <=> sunshine. Note that
        this relationship is symmetric.

        Based on the "/r/Synonym" relation in ConceptNet.
        """
        # compare against None, so that empty data isn't re-loaded on every access
        if self._synonyms is None:
            self._synonyms = self._get_relation_data("/r/Synonym", is_symmetric=True)
        return self._synonyms

    def get_synonyms(self, term, *, lang=None, sense=None):
        """
        Args:
            term (str or :class:`spacy.tokens.Token` or :class:`spacy.tokens.Span`)
            lang (str): Standard code for the language of ``term``.
            sense (str): Sense in which ``term`` is used in context, which in practice
                is just its part of speech. Valid values: "n" or "NOUN", "v" or "VERB",
                "a" or "ADJ", "r" or "ADV".

        Returns:
            List[str]
        """
        return self._get_relation_values(self.synonyms, term, lang=lang, sense=sense)


def _split_uri(uri):
    """
    Get slash-delimited parts of a ConceptNet URI.

    Args:
        uri (str)

    Returns:
        List[str]
    """
    uri = uri.lstrip("/")
    if not uri:
        return []
    return uri.split("/")


def _parse_concept_uri(uri):
    """
    Extract language, term, and sense from a ConceptNet "concept" URI.

    Args:
        uri (str): Concept URI of the form "/c/<lang>/<term>" or
            "/c/<lang>/<term>/<sense>"; any parts after the sense are ignored.

    Returns:
        Tuple[str, str, str]: Language, term, sense. Underscores in the term
        are replaced by spaces; sense is None if the URI doesn't include one.

    Raises:
        ValueError: If ``uri`` doesn't start with "/c/", or if it has
            fewer than three parts.
    """
    if not uri.startswith("/c/"):
        raise ValueError("invalid concept uri: {}".format(uri))
    # keep ``uri`` itself intact, so that error messages show the original string
    # rather than its already-split parts
    parts = _split_uri(uri)
    if len(parts) < 3:
        raise ValueError("not enough parts in uri: {}".format(uri))
    # parts[0] is the "c" prefix already checked above
    lang, term = parts[1], parts[2]
    sense = parts[3] if len(parts) > 3 else None
    return lang, term.replace("_", " "), sense
# Source code for textacy.resources.concept_net

# Navigation

# Related Topics