Source code for textacy.similarity.tokens

"""
Token-based Metrics
-------------------

:mod:`textacy.similarity.tokens`: Normalized similarity metrics built on token-based
algorithms that identify and count similar tokens between one sequence and another,
and don't rely on the *ordering* of those tokens.
"""
import collections
import math
from typing import Iterable


def jaccard(seq1: Iterable[str], seq2: Iterable[str]) -> float:
    """
    Measure the similarity between two sequences of strings as sets
    using the Jaccard index.

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences of strings.

    Reference:
        https://en.wikipedia.org/wiki/Jaccard_index
    """
    set1 = set(seq1)
    set2 = set(seq2)
    try:
        return len(set1 & set2) / len(set1 | set2)
    except ZeroDivisionError:
        # both sequences are empty, so their union is empty
        return 0.0

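# A minimal usage sketch (not part of the original module; the token lists
# are illustrative): two shared tokens out of four distinct tokens overall.
# >>> jaccard(["one", "two", "three"], ["two", "three", "four"])
# 0.5
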
def sorensen_dice(seq1: Iterable[str], seq2: Iterable[str]) -> float:
    """
    Measure the similarity between two sequences of strings as sets
    using the Sørensen-Dice index, which is similar to the Jaccard index.

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences.

    Reference:
        https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
    """
    set1 = set(seq1)
    set2 = set(seq2)
    try:
        return 2 * len(set1 & set2) / (len(set1) + len(set2))
    except ZeroDivisionError:
        return 0.0

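# Usage sketch (illustrative, not from the original source): the same inputs
# as above score higher here, since Sørensen-Dice double-counts the overlap.
# >>> sorensen_dice(["one", "two", "three"], ["two", "three", "four"])
# 0.6666666666666666
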
def tversky(
    seq1: Iterable[str], seq2: Iterable[str], *, alpha: float = 1.0, beta: float = 1.0
) -> float:
    """
    Measure the similarity between two sequences of strings as sets
    using the (symmetric) Tversky index, which is a generalization of
    Jaccard (``alpha=0.5, beta=2.0``) and Sørensen-Dice (``alpha=0.5, beta=1.0``).

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.
        alpha: Relative weight given to the smaller vs. larger of the two
            set differences.
        beta: Weight given to the combined set differences, relative to
            the set intersection.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences.

    Reference:
        https://en.wikipedia.org/wiki/Tversky_index
    """
    set1 = set(seq1)
    set2 = set(seq2)
    intersection = len(set1 & set2)
    set1_not_set2 = len(set1 - set2)
    set2_not_set1 = len(set2 - set1)
    # symmetric variant: weight the smaller set difference by alpha
    # and the larger by (1 - alpha)
    a = min(set1_not_set2, set2_not_set1)
    b = max(set1_not_set2, set2_not_set1)
    try:
        return intersection / (intersection + (beta * (alpha * a + (1 - alpha) * b)))
    except ZeroDivisionError:
        return 0.0

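# Usage sketch (illustrative): with alpha=0.5 and beta=2.0 the denominator
# becomes intersection + both differences, i.e. the union, recovering the
# Jaccard value of 0.5 on the same inputs as above.
# >>> tversky(["one", "two", "three"], ["two", "three", "four"], alpha=0.5, beta=2.0)
# 0.5
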
def cosine(seq1: Iterable[str], seq2: Iterable[str]) -> float:
    """
    Measure the similarity between two sequences of strings as sets
    using the Otsuka-Ochiai variation of cosine similarity (which is
    equivalent to the usual formulation when values are binary).

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences.

    Reference:
        https://en.wikipedia.org/wiki/Cosine_similarity#Otsuka-Ochiai_coefficient
    """
    set1 = set(seq1)
    set2 = set(seq2)
    try:
        return len(set1 & set2) / math.sqrt(len(set1) * len(set2))
    except ZeroDivisionError:
        return 0.0

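# Usage sketch (illustrative): |set1 & set2| / sqrt(|set1| * |set2|) = 2 / 3.
# >>> cosine(["one", "two", "three"], ["two", "three", "four"])
# 0.6666666666666666
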
def bag(seq1: Iterable[str], seq2: Iterable[str]) -> float:
    """
    Measure the similarity between two sequences of strings (*not* as sets)
    using the "bag distance" measure, which can be considered an approximation
    of edit distance.

    Args:
        seq1: First sequence of strings.
        seq2: Second sequence of strings.

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences.

    Reference:
        Bartolini, Ilaria, Paolo Ciaccia, and Marco Patella. "String matching
        with metric trees using an approximate distance." International Symposium
        on String Processing and Information Retrieval. Springer, Berlin,
        Heidelberg, 2002.
    """
    bag1 = collections.Counter(seq1)
    bag2 = collections.Counter(seq2)
    # bag distance: the larger of the two multiset differences,
    # normalized by the longer sequence's total token count
    max_diff = max(sum((bag1 - bag2).values()), sum((bag2 - bag1).values()))
    max_len = max(sum(bag1.values()), sum(bag2.values()))
    try:
        return 1.0 - (max_diff / max_len)
    except ZeroDivisionError:
        return 0.0

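# Usage sketch (illustrative): unlike the set-based metrics above, ``bag``
# counts repeated tokens, so the duplicate "two" lowers the similarity even
# though the two sequences are identical as sets.
# >>> bag(["one", "two", "two", "three"], ["one", "two", "three"])
# 0.75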