Source code for textacy.similarity.edits

"""
Edit-based Metrics
------------------

:mod:`textacy.similarity.edits`: Normalized similarity metrics built on edit-based
algorithms that compute the number of operations (insertions, deletions, substitutions)
needed to transform one string into another.
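
For example, the Levenshtein distance between "cat" and "cart" is 1 (a single
insertion), so the length-normalized similarity is 1 - (1 / 4) = 0.75.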
"""
from __future__ import annotations

from typing import Optional

import sklearn.feature_extraction
import sklearn.metrics
from jellyfish import (
    hamming_distance as _hamming,
    levenshtein_distance as _levenshtein,
    jaro_similarity as _jaro_similarity,
)

from .. import constants


def _shortcut(str1: str, str2: str) -> Optional[float]:
    if not str1 and not str2:
        return 0.0
    elif str1 == str2:
        return 1.0
    else:
        return None
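
# A minimal sketch of the shortcut contract: ``None`` means "no shortcut, compute
# the full metric". Callers must test ``is not None``, since the legitimate 0.0
# result for two empty strings is falsy.
#
#     _shortcut("", "")        # -> 0.0 (two empty strings: defined as dissimilar)
#     _shortcut("abc", "abc")  # -> 1.0 (identical strings: maximally similar)
#     _shortcut("abc", "abd")  # -> None (fall through to the real computation)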


def hamming(str1: str, str2: str) -> float:
    """
    Compute the similarity between two strings using Hamming distance,
    which gives the number of characters at corresponding string indices that differ,
    including chars in the longer string that have no correspondents in the shorter.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        Similarity between ``str1`` and ``str2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar strings.
    """
    shortcut = _shortcut(str1, str2)
    if shortcut is not None:
        return shortcut
    distance = _hamming(str1, str2)
    max_len = max(len(str1), len(str2))
    try:
        return 1.0 - (distance / max_len)
    except ZeroDivisionError:
        return 0.0
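
# A brief usage sketch (values computed by hand from the definition above, with
# each unmatched trailing character in the longer string counted as a difference):
#
#     hamming("abcd", "abcx")  # distance 1, max_len 4 -> 1 - 1/4 = 0.75
#     hamming("abc", "abcde")  # distance 2, max_len 5 -> 1 - 2/5 = 0.6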
def levenshtein(str1: str, str2: str) -> float:
    """
    Measure the similarity between two strings using Levenshtein distance,
    which gives the minimum number of character insertions, deletions, and
    substitutions needed to change one string into the other.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        Similarity between ``str1`` and ``str2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar strings.
    """
    shortcut = _shortcut(str1, str2)
    if shortcut is not None:
        return shortcut
    distance = _levenshtein(str1, str2)
    max_len = max(len(str1), len(str2))
    try:
        return 1.0 - (distance / max_len)
    except ZeroDivisionError:
        return 0.0
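
# A brief usage sketch (the classic "kitten"/"sitting" pair: two substitutions
# plus one insertion give a Levenshtein distance of 3 over a max length of 7):
#
#     levenshtein("kitten", "sitting")  # -> 1 - 3/7 ≈ 0.571
#     levenshtein("kitten", "kitten")   # -> 1.0 via the _shortcut fast path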
def jaro(str1: str, str2: str) -> float:
    """
    Measure the similarity between two strings using Jaro (*not* Jaro-Winkler)
    distance, which searches for common characters while taking transpositions
    and string lengths into account.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        Similarity between ``str1`` and ``str2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar strings.
    """
    # NOTE: I usually try to avoid wrapping other packages' functionality in one-liners,
    # but this is here for consistency with levenshtein and hamming,
    # which do extra work to get normalized similarity values
    shortcut = _shortcut(str1, str2)
    if shortcut is not None:
        return shortcut
    return _jaro_similarity(str1, str2)
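
# A brief usage sketch (the textbook "martha"/"marhta" pair: all 6 characters
# match with one transposition, so Jaro similarity is (1 + 1 + 5/6) / 3):
#
#     jaro("martha", "marhta")  # -> ≈ 0.944
#     jaro("abc", "xyz")        # -> 0.0 (no characters in common)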
def character_ngrams(str1: str, str2: str) -> float:
    """
    Measure the similarity between two strings using a character n-grams
    similarity metric, in which strings are transformed into trigrams of
    alnum-only characters, vectorized and weighted by tf-idf, then compared
    by cosine similarity.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        Similarity between ``str1`` and ``str2`` in the interval [0.0, 1.0],
        where larger values correspond to more similar strings.

    Note:
        This method has been used in cross-lingual plagiarism detection and
        authorship attribution, and seems to work better on longer texts.
        At the very least, it is *slow* on shorter texts relative to the other
        similarity measures.
    """
    # TODO: figure out where this method falls -- not edit-based??
    # NOTE: ``token_pattern`` is not passed, since it is ignored (and triggers
    # a warning) when ``analyzer="char"``.
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
        preprocessor=lambda s: " ".join(constants.RE_ALNUM.findall(s.lower())),
        analyzer="char",
        ngram_range=(3, 3),
    )
    mat = vectorizer.fit_transform([str1, str2])
    return sklearn.metrics.pairwise.cosine_similarity(mat[0, :], mat[1, :]).item()
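
# A brief usage sketch; exact values depend on sklearn's tf-idf weighting, so
# only qualitative behavior is claimed here:
#
#     character_ngrams("plagiarism detection", "plagiarism detector")
#     # -> a float in [0.0, 1.0], close to 1.0 for heavily overlapping trigrams
#     character_ngrams("some long text", "some long text")
#     # -> ≈ 1.0 (identical strings yield identical tf-idf vectors)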