Source code for textacy.similarity.sequences

"""
Sequence-based Metrics
----------------------

:mod:`textacy.similarity.sequences`: Normalized similarity metrics built on
sequence-based algorithms that identify and measure the subsequences common to each.
"""
import difflib
from typing import Sequence


[docs]def matching_subsequences_ratio(
    seq1: Sequence[str], seq2: Sequence[str], **kwargs
) -> float:
    """
    Measure the similarity between two sequences of strings by finding
    contiguous matching subsequences without any "junk" elements and normalizing
    by the total number of elements.

    Args:
        seq1
        seq2
        **kwargs
            isjunk: Optional[Callable[str], bool] = None
            autojunk: bool = True

    Returns:
        Similarity between ``seq1`` and ``seq2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar sequences of strings

    Reference:
        https://docs.python.org/3/library/difflib.html#difflib.SequenceMatcher.ratio
    """
    return difflib.SequenceMatcher(a=seq1, b=seq2, **kwargs).ratio()
Source code for textacy.similarity.sequences

Navigation

Related Topics