Source code for textacy.similarity.hybrid
"""
Hybrid Metrics
--------------
:mod:`textacy.similarity.hybrid`: Normalized similarity metrics that combine edit-,
token-, and/or sequence-based algorithms.
"""
from __future__ import annotations
from typing import Callable, Sequence
from . import edits
from .. import constants
[docs]def token_sort_ratio(s1: str | Sequence[str], s2: str | Sequence[str]) -> float:
"""
Measure the similarity between two strings or sequences of strings
using Levenshtein distance, only with non-alphanumeric characters removed
and the ordering of tokens in each sorted before comparison.
Args:
s1
s2
Returns:
Similarity between ``s1`` and ``s2`` in the interval [0.0, 1.0],
where larger values correspond to more similar strings.
See Also:
:func:`textacy.similarity.edits.levenshtein()`
"""
str1 = _to_prepared_str(s1)
str2 = _to_prepared_str(s2)
return edits.levenshtein(str1, str2)
def _to_prepared_str(s: str | Sequence[str]) -> str:
"""
Remove all characters from ``s`` except letters and numbers, strip whitespace,
and force everything to lower-case; then sort tokens before re-joining into
a single string.
"""
tokens = (
constants.RE_ALNUM.findall(s.lower())
if isinstance(s, str)
else [tok.lower().strip() for tok in s]
)
return " ".join(sorted(tokens))
[docs]def monge_elkan(
seq1: Sequence[str],
seq2: Sequence[str],
sim_func: Callable[[str, str], float] = edits.levenshtein,
) -> float:
"""
Measure the similarity between two sequences of strings using the (symmetric)
Monge-Elkan method, which takes the average of the maximum pairwise similarity
between the tokens in each sequence as compared to those in the other sequence.
Args:
seq1
seq2
sim_func: Callable that computes a string-to-string similarity metric;
by default, Levenshtein edit distance.
Returns:
Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
where larger values correspond to more similar strings.
See Also:
:func:`textacy.similarity.edits.levenshtein()`
"""
if not seq1 or not seq2:
return 0.0
sum_maxsim1 = sum(
max(sim_func(tok1, tok2) for tok2 in seq2)
for tok1 in seq1
)
sum_maxsim2 = sum(
max(sim_func(tok2, tok1) for tok1 in seq1)
for tok2 in seq2
)
return ((sum_maxsim1 / len(seq1)) + (sum_maxsim2 / len(seq2))) / 2