"""
Triples
-------

:mod:`textacy.extract.triples`: Extract structured triples from a document or sentence
through rule-based pattern-matching of the annotated tokens.
"""
from __future__ import annotations

import collections
from operator import attrgetter
from typing import Iterable, List, Optional, Pattern, Tuple

from cytoolz import itertoolz
from spacy.symbols import (
    AUX, VERB,
    agent, attr, aux, auxpass, csubj, csubjpass, dobj, neg, nsubj, nsubjpass, obj, pobj, xcomp,
)
from spacy.tokens import Doc, Span, Token

from . import matches
from .. import constants, types, utils


_NOMINAL_SUBJ_DEPS = {nsubj, nsubjpass}
_CLAUSAL_SUBJ_DEPS = {csubj, csubjpass}
_ACTIVE_SUBJ_DEPS = {csubj, nsubj}
_VERB_MODIFIER_DEPS = {aux, auxpass, neg}

SVOTriple: Tuple[List[Token], List[Token], List[Token]] = collections.namedtuple(
    "SVOTriple", ["subject", "verb", "object"]
)
SSSTriple: Tuple[List[Token], List[Token], List[Token]] = collections.namedtuple(
    "SSSTriple", ["entity", "cue", "fragment"]
)
DQTriple: Tuple[List[Token], List[Token], Span] = collections.namedtuple(
    "DQTriple", ["speaker", "cue", "content"]
)


def subject_verb_object_triples(doclike: types.DocLike) -> Iterable[SVOTriple]:
    """
    Extract an ordered sequence of subject-verb-object triples from a document
    or sentence.

    Args:
        doclike

    Yields:
        Next SVO triple as (subject, verb, object), in approximate order of appearance.
    """
    if isinstance(doclike, Span):
        sents = [doclike]
    else:
        sents = doclike.sents
    for sent in sents:
        # connect subjects/objects to direct verb heads
        # and expand them to include conjuncts, compound nouns, ...
        verb_sos = collections.defaultdict(lambda: collections.defaultdict(set))
        for tok in sent:
            head = tok.head
            # ensure entry for all verbs, even if empty
            # to catch conjunct verbs without direct subject/object deps
            if tok.pos == VERB:
                _ = verb_sos[tok]
            # nominal subject of active or passive verb
            if tok.dep in _NOMINAL_SUBJ_DEPS:
                if head.pos == VERB:
                    verb_sos[head]["subjects"].update(expand_noun(tok))
            # clausal subject of active or passive verb
            elif tok.dep in _CLAUSAL_SUBJ_DEPS:
                if head.pos == VERB:
                    verb_sos[head]["subjects"].update(tok.subtree)
            # nominal direct object of transitive verb
            elif tok.dep == dobj:
                if head.pos == VERB:
                    verb_sos[head]["objects"].update(expand_noun(tok))
            # prepositional object acting as agent of passive verb
            elif tok.dep == pobj:
                if head.dep == agent and head.head.pos == VERB:
                    verb_sos[head.head]["objects"].update(expand_noun(tok))
            # open clausal complement, but not as a secondary predicate
            elif tok.dep == xcomp:
                if (
                    head.pos == VERB
                    and not any(child.dep == dobj for child in head.children)
                ):
                    # TODO: just the verb, or the whole tree?
                    # verb_sos[verb]["objects"].update(expand_verb(tok))
                    verb_sos[head]["objects"].update(tok.subtree)
        # fill in any indirect relationships connected via verb conjuncts
        for verb, so_dict in verb_sos.items():
            conjuncts = verb.conjuncts
            if so_dict.get("subjects"):
                for conj in conjuncts:
                    conj_so_dict = verb_sos.get(conj)
                    if conj_so_dict and not conj_so_dict.get("subjects"):
                        conj_so_dict["subjects"].update(so_dict["subjects"])
            if not so_dict.get("objects"):
                so_dict["objects"].update(
                    obj
                    for conj in conjuncts
                    for obj in verb_sos.get(conj, {}).get("objects", [])
                )
        # expand verbs and restructure into svo triples
        for verb, so_dict in verb_sos.items():
            if so_dict["subjects"] and so_dict["objects"]:
                yield SVOTriple(
                    subject=sorted(so_dict["subjects"], key=attrgetter("i")),
                    verb=sorted(expand_verb(verb), key=attrgetter("i")),
                    object=sorted(so_dict["objects"], key=attrgetter("i")),
                )


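# Usage sketch (illustrative only, not part of the library API): assumes a spaCy
# pipeline such as "en_core_web_sm" is installed and can be loaded by the caller.
def _example_subject_verb_object_triples() -> None:
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Frank lends his old bicycle to Anna, and she fixes the flat tire.")
    for subject, verb, object_ in subject_verb_object_triples(doc):
        # each field is a list of Tokens, sorted by position in the doc
        print(
            [tok.text for tok in subject],
            [tok.text for tok in verb],
            [tok.text for tok in object_],
        )

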
def semistructured_statements(
    doclike: types.DocLike,
    *,
    entity: str | Pattern,
    cue: str,
    fragment_len_range: Optional[Tuple[Optional[int], Optional[int]]] = None,
) -> Iterable[SSSTriple]:
    """
    Extract "semi-structured statements" from a document as a sequence of
    (entity, cue, fragment) triples.

    Args:
        doclike
        entity: Noun or noun phrase of interest expressed as a regular expression
            pattern string (e.g. ``"[Gg]lobal [Ww]arming"``)
            or compiled object (e.g. ``re.compile("global warming", re.IGNORECASE)``).
        cue: Verb lemma with which ``entity`` is associated
            (e.g. "be", "have", "say").
        fragment_len_range: Filter statements to those whose fragment length in tokens
            is within the specified [low, high) interval. Both low and high values
            must be specified, but a null value for either is automatically replaced
            by a safe default. None (default) skips filtering by fragment length.

    Yields:
        Next matching triple, consisting of (entity, cue, fragment),
        in order of appearance.

    Notes:
        Inspired by N. Diakopoulos, A. Zhang, A. Salway. Visual Analytics of
        Media Frames in Online News and Blogs. IEEE InfoVis Workshop on
        Text Visualization. October, 2013.

        Which itself was inspired by Salway, A.; Kelly, L.; Skadiņa, I.; and Jones, G. 2010.
        Portable Extraction of Partially Structured Facts from the Web.
        In Proc. ICETAL 2010, LNAI 6233, 345-356. Heidelberg, Springer.
    """
    if fragment_len_range is not None:
        fragment_len_range = utils.validate_and_clip_range(
            fragment_len_range, (1, 1000), int
        )
    for entity_cand in matches.regex_matches(doclike, entity, alignment_mode="strict"):
        # is the entity candidate a nominal subject?
        if entity_cand.root.dep in _NOMINAL_SUBJ_DEPS:
            cue_cand = entity_cand.root.head
            # is the cue candidate a verb with matching lemma?
            if cue_cand.pos in {VERB, AUX} and cue_cand.lemma_ == cue:
                frag_cand = None
                for tok in cue_cand.children:
                    if (
                        tok.dep in {attr, dobj, obj}
                        or tok.dep_ == "dative"
                        or (
                            tok.dep == xcomp
                            and not any(
                                child.dep == dobj for child in cue_cand.children
                            )
                        )
                    ):
                        subtoks = list(tok.subtree)
                        if (
                            fragment_len_range is None
                            or fragment_len_range[0] <= len(subtoks) < fragment_len_range[1]
                        ):
                            frag_cand = subtoks
                            break
                if frag_cand is not None:
                    yield SSSTriple(
                        entity=list(entity_cand),
                        cue=sorted(expand_verb(cue_cand), key=attrgetter("i")),
                        fragment=sorted(frag_cand, key=attrgetter("i")),
                    )


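# Usage sketch (illustrative only, not part of the library API): assumes the
# "en_core_web_sm" spaCy pipeline is installed; the entity/cue values mirror
# the examples given in the docstring above.
def _example_semistructured_statements() -> None:
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Global warming is a serious problem for coastal cities.")
    for entity, cue, fragment in semistructured_statements(
        doc, entity="[Gg]lobal [Ww]arming", cue="be"
    ):
        # entity, cue, and fragment are all lists of Tokens
        print(
            [tok.text for tok in entity],
            [tok.text for tok in cue],
            [tok.text for tok in fragment],
        )

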
def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
    """
    Extract direct quotations with an attributable speaker from a document
    using simple rules and patterns. Does not extract indirect or mixed quotations!

    Args:
        doc

    Yields:
        Next direct quotation in ``doc`` as a (speaker, cue, content) triple.

    Notes:
        Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
        Tagging of Reported Speech in Newspaper Articles".
    """
    # TODO: train a model to do this instead, maybe similar to entity recognition
    try:
        _reporting_verbs = constants.REPORTING_VERBS[doc.lang_]
    except KeyError:
        raise ValueError(
            f"direct quotation extraction is not implemented for lang='{doc.lang_}', "
            f"only {sorted(constants.REPORTING_VERBS.keys())}"
        )
    qtok_idxs = [tok.i for tok in doc if tok.is_quote]
    if len(qtok_idxs) % 2 != 0:
        raise ValueError(
            f"{len(qtok_idxs)} quotation marks found, indicating an unclosed quotation; "
            "given the limitations of this method, it's safest to bail out "
            "rather than guess which quotation is unclosed"
        )
    qtok_pair_idxs = list(itertoolz.partition(2, qtok_idxs))
    for qtok_start_idx, qtok_end_idx in qtok_pair_idxs:
        content = doc[qtok_start_idx : qtok_end_idx + 1]
        cue = None
        speaker = None
        # filter quotations by content
        if (
            # quotations should have at least a couple tokens
            # excluding the first/last quotation mark tokens
            len(content) < 4
            # filter out titles of books and such, if possible
            or all(
                tok.is_title
                for tok in content
                # if tok.pos in {NOUN, PROPN}
                if not (tok.is_punct or tok.is_stop)
            )
            # TODO: require closing punctuation before the quotation mark?
            # content[-2].is_punct is False
        ):
            continue
        # get window of adjacent/overlapping sentences
        window_sents = (
            sent
            for sent in doc.sents
            # these boundary cases are a subtle bit of work...
            if (
                (sent.start < qtok_start_idx and sent.end >= qtok_start_idx - 1)
                or (sent.start <= qtok_end_idx + 1 and sent.end > qtok_end_idx)
            )
        )
        # get candidate cue verbs in window
        cue_cands = [
            tok
            for sent in window_sents
            for tok in sent
            if (
                tok.pos == VERB
                and tok.lemma_ in _reporting_verbs
                # cue verbs must occur *outside* any quotation content
                and not any(
                    qts_idx <= tok.i <= qte_idx
                    for qts_idx, qte_idx in qtok_pair_idxs
                )
            )
        ]
        # sort candidates by proximity to quote content
        cue_cands = sorted(
            cue_cands,
            key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)),
        )
        for cue_cand in cue_cands:
            if cue is not None:
                break
            for speaker_cand in cue_cand.children:
                if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
                    cue = expand_verb(cue_cand)
                    speaker = expand_noun(speaker_cand)
                    break
        if content and cue and speaker:
            yield DQTriple(
                speaker=sorted(speaker, key=attrgetter("i")),
                cue=sorted(cue, key=attrgetter("i")),
                content=content,
            )


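# Usage sketch (illustrative only, not part of the library API): assumes the
# "en_core_web_sm" spaCy pipeline is installed and that ``doc.lang_`` has an
# entry in constants.REPORTING_VERBS.
def _example_direct_quotations() -> None:
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp('"The tests finally pass on every platform," the developer said.')
    for speaker, cue, content in direct_quotations(doc):
        # speaker and cue are lists of Tokens; content is a Span
        # that includes the enclosing quotation marks
        print([tok.text for tok in speaker], [tok.text for tok in cue], content.text)

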
def expand_noun(tok: Token) -> List[Token]:
    """Expand a noun token to include all associated conjunct and compound nouns."""
    tok_and_conjuncts = [tok] + list(tok.conjuncts)
    compounds = [
        child
        for tc in tok_and_conjuncts
        for child in tc.children
        # TODO: why doesn't compound import from spacy.symbols?
        if child.dep_ == "compound"
    ]
    return tok_and_conjuncts + compounds


def expand_verb(tok: Token) -> List[Token]:
    """Expand a verb token to include all associated auxiliary and negation tokens."""
    verb_modifiers = [
        child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS
    ]
    return [tok] + verb_modifiers


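# Usage sketch for the two expansion helpers above (illustrative only): assumes a
# parsed doc from an installed spaCy pipeline such as "en_core_web_sm"; the exact
# expansions depend on the dependency parse produced by that pipeline.
def _example_expand_helpers() -> None:
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The committee chair and the treasurer did not approve the budget.")
    noun = next(tok for tok in doc if tok.text == "chair")
    verb = next(tok for tok in doc if tok.text == "approve")
    print([tok.text for tok in expand_noun(noun)])  # noun plus conjuncts/compounds
    print([tok.text for tok in expand_verb(verb)])  # verb plus aux/neg modifiers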