Source code for textacy.similarity.hybrid

"""
Hybrid Metrics

:mod:`textacy.similarity.hybrid`: Normalized similarity metrics that combine edit-,
token-, and/or sequence-based algorithms.
"""
from __future__ import annotations

from typing import Callable, Sequence

from . import edits
from .. import constants

def token_sort_ratio(s1: str | Sequence[str], s2: str | Sequence[str]) -> float:
    """
    Compute a token-order-insensitive Levenshtein similarity between two inputs,
    each of which is either a single string or a sequence of string tokens.

    Both inputs are first normalized by :func:`_to_prepared_str()` (lower-cased,
    tokens sorted and re-joined with single spaces), and the resulting strings
    are then compared with :func:`textacy.similarity.edits.levenshtein()`.

    Args:
        s1: First string, or sequence of tokens, to compare.
        s2: Second string, or sequence of tokens, to compare.

    Returns:
        Similarity between ``s1`` and ``s2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar strings.

    See Also:
        :func:`textacy.similarity.edits.levenshtein()`
    """
    # s1 is prepared before s2; the two prepared strings are compared as-is.
    return edits.levenshtein(_to_prepared_str(s1), _to_prepared_str(s2))
def _to_prepared_str(s: str | Sequence[str]) -> str: """ Remove all characters from ``s`` except letters and numbers, strip whitespace, and force everything to lower-case; then sort tokens before re-joining into a single string. """ tokens = ( constants.RE_ALNUM.findall(s.lower()) if isinstance(s, str) else [tok.lower().strip() for tok in s] ) return " ".join(sorted(tokens))
def monge_elkan(
    seq1: Sequence[str],
    seq2: Sequence[str],
    sim_func: Callable[[str, str], float] = edits.levenshtein,
) -> float:
    """
    Compute the symmetric Monge-Elkan similarity between two sequences of strings.

    For every token in one sequence, the best (maximum) similarity against all
    tokens in the other sequence is found; those best scores are averaged per
    sequence, and the two directional averages are then averaged together.

    Args:
        seq1: First sequence of string tokens.
        seq2: Second sequence of string tokens.
        sim_func: Callable that computes a string-to-string similarity metric;
            by default, Levenshtein edit distance.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar strings.
        If either sequence is empty, 0.0 is returned.

    See Also:
        :func:`textacy.similarity.edits.levenshtein()`
    """
    # Guard against empty input: max() over nothing and division by zero.
    if not (seq1 and seq2):
        return 0.0

    # Best match in seq2 for each token of seq1 ...
    best_for_seq1 = [max(sim_func(a, b) for b in seq2) for a in seq1]
    # ... and best match in seq1 for each token of seq2. Arguments are passed
    # in (seq2 token, seq1 token) order, since sim_func need not be symmetric.
    best_for_seq2 = [max(sim_func(b, a) for a in seq1) for b in seq2]

    mean_1_to_2 = sum(best_for_seq1) / len(seq1)
    mean_2_to_1 = sum(best_for_seq2) / len(seq2)
    return (mean_1_to_2 + mean_2_to_1) / 2