"""
Matches
-------
:mod:`textacy.extract.matches`: Extract matching spans from a document or sentence
using spaCy's built-in matcher or regular expressions.
"""
from __future__ import annotations
import re
from typing import Callable, Dict, Iterable, List, Optional, Pattern, Union
from spacy.matcher import Matcher
from spacy.tokens import Span
from .. import constants, errors, types
[docs]def token_matches(
doclike: types.DocLike,
patterns: str | List[str] | List[Dict[str, str]] | List[List[Dict[str, str]]],
*,
on_match: Optional[Callable] = None,
) -> Iterable[Span]:
"""
Extract ``Span`` s from a document or sentence matching one or more patterns
of per-token attr:value pairs, with optional quantity qualifiers.
Args:
doclike
patterns:
One or multiple patterns to match against ``doclike``
using a :class:`spacy.matcher.Matcher`.
If List[dict] or List[List[dict]], each pattern is specified
as attr: value pairs per token, with optional quantity qualifiers:
- ``[{"POS": "NOUN"}]`` matches singular or plural nouns,
like "friend" or "enemies"
- ``[{"POS": "PREP"}, {"POS": "DET", "OP": "?"}, {"POS": "ADJ", "OP": "?"}, {"POS": "NOUN", "OP": "+"}]``
matches prepositional phrases, like "in the future" or "from the distant past"
- ``[{"IS_DIGIT": True}, {"TAG": "NNS"}]`` matches numbered plural nouns,
like "60 seconds" or "2 beers"
- ``[{"POS": "PROPN", "OP": "+"}, {}]`` matches proper nouns and
whatever word follows them, like "Burton DeWilde yaaasss"
If str or List[str], each pattern is specified as one or more
per-token patterns separated by whitespace where attribute, value,
and optional quantity qualifiers are delimited by colons. Note that
boolean and integer values have special syntax --- "bool(val)" and
"int(val)", respectively --- and that wildcard tokens still need
a colon between the (empty) attribute and value strings.
- ``"POS:NOUN"`` matches singular or plural nouns
- ``"POS:PREP POS:DET:? POS:ADJ:? POS:NOUN:+"`` matches prepositional phrases
- ``"IS_DIGIT:bool(True) TAG:NNS"`` matches numbered plural nouns
- ``"POS:PROPN:+ :"`` matches proper nouns and whatever word follows them
Also note that these pattern strings don't support spaCy v2.1's
"extended" pattern syntax; if you need such complex patterns, it's
probably better to use a List[dict] or List[List[dict]], anyway.
on_match: Callback function to act on matches.
Takes the arguments ``matcher``, ``doclike``, ``i`` and ``matches``.
Yields:
Next matching ``Span`` in ``doclike``, in order of appearance
Raises:
TypeError
ValueError
See Also:
- https://spacy.io/usage/rule-based-matching
- https://spacy.io/api/matcher
""" # noqa: E501
if isinstance(patterns, str):
patterns = [_make_pattern_from_string(patterns)]
elif isinstance(patterns, (list, tuple)):
if all(isinstance(item, str) for item in patterns):
patterns = [_make_pattern_from_string(pattern) for pattern in patterns]
elif all(isinstance(item, dict) for item in patterns):
patterns = [patterns]
elif all(isinstance(item, (list, tuple)) for item in patterns):
pass # already in the right format!
else:
raise TypeError(
errors.type_invalid_msg(
"patterns",
type(patterns),
Union[
str, List[str], List[Dict[str, str]], List[List[Dict[str, str]]]
],
)
)
else:
raise TypeError(
errors.type_invalid_msg(
"patterns",
type(patterns),
Union[str, List[str], List[Dict[str, str]], List[List[Dict[str, str]]]],
)
)
matcher = Matcher(doclike.vocab)
matcher.add("match", patterns, on_match=on_match)
for match in matcher(doclike, as_spans=True):
yield match
def _make_pattern_from_string(patstr: str) -> List[Dict[str, str]]:
pattern = []
for tokpatstr in constants.RE_MATCHER_TOKPAT_DELIM.split(patstr):
parts = tokpatstr.split(":")
if 2 <= len(parts) <= 3:
attr = parts[0]
attr_val = parts[1]
if attr and attr_val:
# handle special bool and int attribute values
special_val = constants.RE_MATCHER_SPECIAL_VAL.match(attr_val)
if special_val:
attr_val = eval(special_val.group(0))
tokpat = {attr: attr_val}
# handle wildcard tokens
else:
tokpat = {}
# handle quantifier ops
try:
op_val = parts[2]
if op_val in constants.MATCHER_VALID_OPS:
tokpat["OP"] = op_val
else:
raise ValueError(
errors.value_invalid_msg(
"op", op_val, constants.MATCHER_VALID_OPS
)
)
except IndexError:
pass
pattern.append(tokpat)
else:
raise ValueError(
f"pattern string '{patstr}' is invalid; "
"each element in a pattern string must contain an attribute, "
"a corresponding value, and an optional quantity qualifier, "
"delimited by colons, like attr:value:op"
)
return pattern
[docs]def regex_matches(
doclike: types.DocLike,
pattern: str | Pattern,
*,
alignment_mode: str = "strict", # Literal["strict", "contract", "expand"]
) -> Iterable[Span]:
"""
Extract ``Span`` s from a document or sentence whose full texts match against
a regular expression ``pattern``.
Args:
doclike
pattern: Valid regular expression against which to match document text,
either as a string or compiled pattern object.
alignment_mode: How character indices of regex matches snap to spaCy token
boundaries. If "strict", only exact alignments are included (no snapping);
if "contract", tokens completely within the character span are included;
if "expand", tokens at least partially covered by the character span
are included.
Yields:
Next matching ``Span``.
"""
for match in re.finditer(pattern, doclike.text):
start_char_idx, end_char_idx = match.span()
span = doclike.char_span(
start_char_idx, end_char_idx, alignment_mode=alignment_mode
)
# Doc.char_span() returns None if character indices don’t map to a valid span
if span is not None:
yield span