Source code for textacy.similarity.tokens

"""
Token-based Metrics
-------------------

:mod:`textacy.similarity.tokens`: Normalized similarity metrics built on token-based
algorithms that identify and count similar tokens between one sequence and another,
and don't rely on the *ordering* of those tokens.
"""
import collections
import math
from typing import Iterable


def jaccard(seq1: Iterable[str], seq2: Iterable[str]) -> float:
    """
    Measure the similarity between two sequences of strings as sets
    using the Jaccard index.

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences of strings.

    Reference:
        https://en.wikipedia.org/wiki/Jaccard_index
    """
    set1 = set(seq1)
    set2 = set(seq2)
    try:
        return len(set1 & set2) / len(set1 | set2)
    except ZeroDivisionError:
        # both sequences are empty, so their union is empty
        return 0.0

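# A minimal usage sketch (not part of the original module; the token lists
# are illustrative): two shared tokens out of four distinct tokens overall.
# >>> jaccard(["one", "two", "three"], ["two", "three", "four"])
# 0.5
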
def sorensen_dice(seq1: Iterable[str], seq2: Iterable[str]) -> float:
    """
    Measure the similarity between two sequences of strings as sets
    using the Sørensen-Dice index, which is similar to the Jaccard index.

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences.

    Reference:
        https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
    """
    set1 = set(seq1)
    set2 = set(seq2)
    try:
        return 2 * len(set1 & set2) / (len(set1) + len(set2))
    except ZeroDivisionError:
        return 0.0

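# Usage sketch (illustrative, not from the original source): the same inputs
# as above score higher here, since Sørensen-Dice double-counts the overlap.
# >>> sorensen_dice(["one", "two", "three"], ["two", "three", "four"])
# 0.6666666666666666
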
def tversky(
    seq1: Iterable[str], seq2: Iterable[str], *, alpha: float = 1.0, beta: float = 1.0
) -> float:
    """
    Measure the similarity between two sequences of strings as sets
    using the (symmetric) Tversky index, which is a generalization of
    Jaccard (``alpha=0.5, beta=2.0``) and Sørensen-Dice (``alpha=0.5, beta=1.0``).

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.
        alpha: Relative weight given to the smaller vs. larger of the two
            set differences.
        beta: Weight given to the combined set differences, relative to
            the set intersection.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences.

    Reference:
        https://en.wikipedia.org/wiki/Tversky_index
    """
    set1 = set(seq1)
    set2 = set(seq2)
    intersection = len(set1 & set2)
    set1_not_set2 = len(set1 - set2)
    set2_not_set1 = len(set2 - set1)
    # symmetric variant: weight the smaller set difference by alpha
    # and the larger by (1 - alpha)
    a = min(set1_not_set2, set2_not_set1)
    b = max(set1_not_set2, set2_not_set1)
    try:
        return intersection / (intersection + (beta * (alpha * a + (1 - alpha) * b)))
    except ZeroDivisionError:
        return 0.0

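# Usage sketch (illustrative): with alpha=0.5 and beta=2.0 the denominator
# becomes intersection + both differences, i.e. the union, recovering the
# Jaccard value of 0.5 on the same inputs as above.
# >>> tversky(["one", "two", "three"], ["two", "three", "four"], alpha=0.5, beta=2.0)
# 0.5
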
def cosine(seq1: Iterable[str], seq2: Iterable[str]) -> float:
    """
    Measure the similarity between two sequences of strings as sets
    using the Otsuka-Ochiai variation of cosine similarity (which is
    equivalent to the usual formulation when values are binary).

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences.

    Reference:
        https://en.wikipedia.org/wiki/Cosine_similarity#Otsuka-Ochiai_coefficient
    """
    set1 = set(seq1)
    set2 = set(seq2)
    try:
        return len(set1 & set2) / math.sqrt(len(set1) * len(set2))
    except ZeroDivisionError:
        return 0.0

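# Usage sketch (illustrative): |set1 & set2| / sqrt(|set1| * |set2|) = 2 / 3.
# >>> cosine(["one", "two", "three"], ["two", "three", "four"])
# 0.6666666666666666
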
def bag(seq1: Iterable[str], seq2: Iterable[str]) -> float:
    """
    Measure the similarity between two sequences of strings (*not* as sets)
    using the "bag distance" measure, which can be considered an approximation
    of edit distance.

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences.

    Reference:
        Bartolini, Ilaria, Paolo Ciaccia, and Marco Patella. "String matching
        with metric trees using an approximate distance." International Symposium
        on String Processing and Information Retrieval. Springer, Berlin,
        Heidelberg, 2002.
    """
    bag1 = collections.Counter(seq1)
    bag2 = collections.Counter(seq2)
    # bag distance: the larger of the two multiset differences,
    # normalized by the longer sequence's total token count
    max_diff = max(sum((bag1 - bag2).values()), sum((bag2 - bag1).values()))
    max_len = max(sum(bag1.values()), sum(bag2.values()))
    try:
        return 1.0 - (max_diff / max_len)
    except ZeroDivisionError:
        return 0.0

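# Usage sketch (illustrative): unlike the set-based metrics above, ``bag``
# counts repeated tokens, so the duplicate "two" lowers the similarity even
# though the two sequences are identical as sets.
# >>> bag(["one", "two", "two", "three"], ["one", "two", "three"])
# 0.75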