Source code for textacy.extract.matches


:mod:`textacy.extract.matches`: Extract matching spans from a document or sentence
using spaCy's built-in matcher or regular expressions.
from __future__ import annotations

import re
from typing import Callable, Dict, Iterable, List, Optional, Pattern, Union

from spacy.matcher import Matcher
from spacy.tokens import Span

from .. import constants, errors, types

def token_matches(
    doclike: types.DocLike,
    patterns: str | List[str] | List[Dict[str, str]] | List[List[Dict[str, str]]],
    *,
    on_match: Optional[Callable] = None,
) -> Iterable[Span]:
    """
    Yield every ``Span`` in ``doclike`` that matches at least one token pattern,
    where a pattern is a sequence of per-token attr:value pairs with optional
    quantity qualifiers.

    Args:
        doclike: Document or sentence to search. Its ``vocab`` is used
            to build the matcher.
        patterns: One or multiple patterns to match against ``doclike``
            using a :class:`spacy.matcher.Matcher`.

            If List[dict] or List[List[dict]], each pattern is specified
            as attr: value pairs per token, with optional quantity qualifiers:

            - ``[{"POS": "NOUN"}]`` matches singular or plural nouns,
              like "friend" or "enemies"
            - ``[{"POS": "PREP"}, {"POS": "DET", "OP": "?"}, {"POS": "ADJ", "OP": "?"}, {"POS": "NOUN", "OP": "+"}]``
              matches prepositional phrases, like "in the future"
              or "from the distant past"
            - ``[{"IS_DIGIT": True}, {"TAG": "NNS"}]`` matches numbered plural nouns,
              like "60 seconds" or "2 beers"
            - ``[{"POS": "PROPN", "OP": "+"}, {}]`` matches proper nouns
              and whatever word follows them, like "Burton DeWilde yaaasss"

            If str or List[str], each pattern is specified as one or more
            per-token patterns separated by whitespace where attribute, value,
            and optional quantity qualifiers are delimited by colons. Note that
            boolean and integer values have special syntax --- "bool(val)" and
            "int(val)", respectively --- and that wildcard tokens still need
            a colon between the (empty) attribute and value strings.

            - ``"POS:NOUN"`` matches singular or plural nouns
            - ``"POS:PREP POS:DET:? POS:ADJ:? POS:NOUN:+"`` matches
              prepositional phrases
            - ``"IS_DIGIT:bool(True) TAG:NNS"`` matches numbered plural nouns
            - ``"POS:PROPN:+ :"`` matches proper nouns and whatever word
              follows them

            Also note that these pattern strings don't support spaCy v2.1's
            "extended" pattern syntax; if you need such complex patterns,
            it's probably better to use a List[dict] or List[List[dict]], anyway.
        on_match: Callback function to act on matches. Takes the arguments
            ``matcher``, ``doclike``, ``i`` and ``matches``.

    Yields:
        Next matching ``Span`` in ``doclike``, in order of appearance

    Raises:
        TypeError: If ``patterns`` is not a str, or a list/tuple whose items
            are all str, all dict, or all list/tuple.
        ValueError: If a pattern string cannot be parsed.

    See Also:
        - https://spacy.io/usage/rule-based-matching
        - https://spacy.io/api/matcher
    """  # noqa: E501
    # Normalize every accepted input shape into a sequence of token-pattern
    # sequences; ``None`` marks an unsupported input.
    if isinstance(patterns, str):
        match_patterns = [_make_pattern_from_string(patterns)]
    elif not isinstance(patterns, (list, tuple)):
        match_patterns = None
    elif all(isinstance(item, str) for item in patterns):
        match_patterns = [_make_pattern_from_string(item) for item in patterns]
    elif all(isinstance(item, dict) for item in patterns):
        # a single pattern given as a flat sequence of per-token dicts
        match_patterns = [patterns]
    elif all(isinstance(item, (list, tuple)) for item in patterns):
        # already a sequence of patterns; use as given
        match_patterns = patterns
    else:
        match_patterns = None

    if match_patterns is None:
        raise TypeError(
            errors.type_invalid_msg(
                "patterns",
                type(patterns),
                Union[str, List[str], List[Dict[str, str]], List[List[Dict[str, str]]]],
            )
        )

    matcher = Matcher(doclike.vocab)
    matcher.add("match", match_patterns, on_match=on_match)
    yield from matcher(doclike, as_spans=True)
def _make_pattern_from_string(patstr: str) -> List[Dict[str, str]]:
    """
    Convert a pattern string into a list of per-token pattern dicts.

    ``patstr`` is split into per-token pieces with
    ``constants.RE_MATCHER_TOKPAT_DELIM``; each piece is then split on ":"
    into ``attr:value`` or ``attr:value:op``.

    Args:
        patstr: Pattern string, e.g. ``"POS:NOUN"`` or ``"POS:PROPN:+ :"``.

    Returns:
        One dict per token piece: ``{attr: value}`` when both attribute and
        value are non-empty, otherwise ``{}`` (a wildcard token); plus an
        ``"OP"`` entry when a quantity qualifier is given.

    Raises:
        ValueError: If a piece does not have 2 or 3 colon-delimited parts,
            or if its qualifier is not in ``constants.MATCHER_VALID_OPS``.
    """
    pattern = []
    for tokpatstr in constants.RE_MATCHER_TOKPAT_DELIM.split(patstr):
        parts = tokpatstr.split(":")
        if not 2 <= len(parts) <= 3:
            raise ValueError(
                f"pattern string '{patstr}' is invalid; "
                "each element in a pattern string must contain an attribute, "
                "a corresponding value, and an optional quantity qualifier, "
                "delimited by colons, like attr:value:op"
            )

        attr, attr_val = parts[0], parts[1]
        if attr and attr_val:
            # handle special bool and int attribute values
            special_val = constants.RE_MATCHER_SPECIAL_VAL.match(attr_val)
            if special_val:
                # NOTE(review): the argument of this eval() call was missing
                # from the source listing; it is reconstructed here as the text
                # matched by RE_MATCHER_SPECIAL_VAL -- confirm against upstream.
                # SECURITY: eval() executes arbitrary expressions, so pattern
                # strings must come from trusted callers only; consider a
                # restricted parser (e.g. ast.literal_eval plus an explicit
                # bool/int cast) if they can ever be user-supplied.
                attr_val = eval(special_val.group(0))
            tokpat = {attr: attr_val}
        else:
            # handle wildcard tokens: empty attribute and/or value
            tokpat = {}

        # handle quantifier ops; only present when there is a third part
        if len(parts) == 3:
            op_val = parts[2]
            if op_val not in constants.MATCHER_VALID_OPS:
                raise ValueError(
                    errors.value_invalid_msg(
                        "op", op_val, constants.MATCHER_VALID_OPS
                    )
                )
            tokpat["OP"] = op_val

        pattern.append(tokpat)
    return pattern
def regex_matches(
    doclike: types.DocLike,
    pattern: str | Pattern,
    *,
    alignment_mode: str = "strict",  # Literal["strict", "contract", "expand"]
) -> Iterable[Span]:
    """
    Yield each ``Span`` of ``doclike`` whose full text matches the regular
    expression ``pattern``.

    Args:
        doclike: Document or sentence whose ``text`` is searched.
        pattern: Valid regular expression against which to match document text,
            either as a string or compiled pattern object.
        alignment_mode: How character indices of regex matches snap to spaCy
            token boundaries. If "strict", only exact alignments are included
            (no snapping); if "contract", tokens completely within the character
            span are included; if "expand", tokens at least partially covered
            by the character span are included.

    Yields:
        Next matching ``Span``.
    """
    for hit in re.finditer(pattern, doclike.text):
        candidate = doclike.char_span(
            hit.start(), hit.end(), alignment_mode=alignment_mode
        )
        # char_span() gives back None when the character offsets
        # cannot be mapped onto a valid span; such hits are skipped
        if candidate is None:
            continue
        yield candidate