
"""
Sparse Vectors
--------------

:mod:`textacy.representations.sparse_vec`: Transform a collection of tokenized docs
into a doc-term matrix of shape (# docs, # unique terms) or a group-term matrix
of shape (# unique groups, # unique terms), with various ways to filter/limit
included terms and flexible weighting/normalization schemes for their values.

Intended primarily as a simpler, higher-level API for sparse vectorization of docs.
"""
from typing import Dict, Iterable, Optional, Tuple

import scipy.sparse as sp

from . import vectorizers


def build_doc_term_matrix(
    tokenized_docs: Iterable[Iterable[str]],
    *,
    tf_type: str = "linear",  # Literal["linear", "sqrt", "log", "binary"]
    idf_type: Optional[str] = None,  # Optional[Literal["standard", "smooth", "bm25"]]
    dl_type: Optional[str] = None,  # Optional[Literal["linear", "sqrt", "log"]]
    **kwargs,
) -> Tuple[sp.csr_matrix, Dict[str, int]]:
    """
    Transform one or more tokenized documents into a document-term matrix
    of shape (# docs, # unique terms), with flexible weighting/normalization of values.

    Args:
        tokenized_docs: A sequence of tokenized documents, where each is a sequence
            of term strings. For example::

                >>> ([tok.lemma_ for tok in spacy_doc]
                ...  for spacy_doc in spacy_docs)
                >>> ((ne.text for ne in extract.entities(doc))
                ...  for doc in corpus)

        tf_type: Type of term frequency (tf) to use for weights' local component:

            - "linear": tf (tfs are already linear, so left as-is)
            - "sqrt": tf => sqrt(tf)
            - "log": tf => log(tf) + 1
            - "binary": tf => 1

        idf_type: Type of inverse document frequency (idf) to use for weights'
            global component:

            - "standard": idf = log(n_docs / df) + 1.0
            - "smooth": idf = log((n_docs + 1) / (df + 1)) + 1.0, i.e. 1 is added
              to all document frequencies, as if a single document containing
              every unique term were added to the corpus
            - "bm25": idf = log((n_docs - df + 0.5) / (df + 0.5)), a form commonly
              used in information retrieval that allows very common terms
              to receive negative weights
            - None: no global weighting is applied to local term weights

        dl_type: Type of document-length scaling to use for weights'
            normalization component:

            - "linear": dl (dls are already linear, so left as-is)
            - "sqrt": dl => sqrt(dl)
            - "log": dl => log(dl)
            - None: no length normalization is applied to the weights

        **kwargs: Passed directly into the vectorizer class.

    Returns:
        Document-term matrix as a sparse row matrix, and the corresponding mapping
        of term strings to integer ids (column indexes).

    Note:
        If you need to transform other sequences of tokenized documents in the same way,
        or if you need more access to the underlying vectorization process,
        consider using :class:`textacy.representations.vectorizers.Vectorizer` directly.

    See Also:
        - :class:`textacy.representations.vectorizers.Vectorizer`
        - :class:`scipy.sparse.csr_matrix`

    Reference:
        https://en.wikipedia.org/wiki/Document-term_matrix
    """
    vectorizer = vectorizers.Vectorizer(
        tf_type=tf_type, idf_type=idf_type, dl_type=dl_type, **kwargs
    )
    doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
    return (doc_term_matrix, vectorizer.vocabulary_terms)
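

# A minimal usage sketch (editorial illustration, not part of the library source):
# the token lists below are made-up placeholders; any iterable of token sequences,
# such as the spaCy-based generators shown in the docstring, works the same way.
#
#     tokenized_docs = [
#         ["cat", "sit", "mat"],
#         ["cat", "eat", "fish"],
#         ["dog", "sit", "mat"],
#     ]
#     dtm, vocab_terms = build_doc_term_matrix(
#         tokenized_docs, tf_type="linear", idf_type="smooth", dl_type=None
#     )
#     # dtm is a scipy.sparse.csr_matrix of shape (3, # unique terms);
#     # vocab_terms maps each term string to its column index in dtm.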


def build_grp_term_matrix(
    tokenized_docs: Iterable[Iterable[str]],
    grps: Iterable[str],
    *,
    tf_type: str = "linear",  # Literal["linear", "sqrt", "log", "binary"]
    idf_type: Optional[str] = None,  # Optional[Literal["standard", "smooth", "bm25"]]
    dl_type: Optional[str] = None,  # Optional[Literal["linear", "sqrt", "log"]]
    **kwargs,
) -> Tuple[sp.csr_matrix, Dict[str, int], Dict[str, int]]:
    """
    Transform one or more tokenized documents into a group-term matrix
    of shape (# unique groups, # unique terms), with flexible weighting/normalization
    of values.

    This is an extension of typical document-term matrix vectorization, in which
    terms are grouped by the documents in which they co-occur. It allows for
    customized grouping, such as by a shared author or publication year, that may
    span multiple documents, without requiring users to merge those documents themselves.

    Args:
        tokenized_docs: A sequence of tokenized documents, where each is a sequence
            of term strings. For example::

                >>> ([tok.lemma_ for tok in spacy_doc]
                ...  for spacy_doc in spacy_docs)
                >>> ((ne.text for ne in extract.entities(doc))
                ...  for doc in corpus)

        grps: Sequence of group names by which the terms in ``tokenized_docs``
            are aggregated, where the first item in ``grps`` corresponds to the first
            item in ``tokenized_docs``, and so on.
        tf_type: Type of term frequency (tf) to use for weights' local component:

            - "linear": tf (tfs are already linear, so left as-is)
            - "sqrt": tf => sqrt(tf)
            - "log": tf => log(tf) + 1
            - "binary": tf => 1

        idf_type: Type of inverse document frequency (idf) to use for weights'
            global component:

            - "standard": idf = log(n_docs / df) + 1.0
            - "smooth": idf = log((n_docs + 1) / (df + 1)) + 1.0, i.e. 1 is added
              to all document frequencies, as if a single document containing
              every unique term were added to the corpus
            - "bm25": idf = log((n_docs - df + 0.5) / (df + 0.5)), a form commonly
              used in information retrieval that allows very common terms
              to receive negative weights
            - None: no global weighting is applied to local term weights

        dl_type: Type of document-length scaling to use for weights'
            normalization component:

            - "linear": dl (dls are already linear, so left as-is)
            - "sqrt": dl => sqrt(dl)
            - "log": dl => log(dl)
            - None: no length normalization is applied to the weights

        **kwargs: Passed directly into the vectorizer class.

    Returns:
        Group-term matrix as a sparse row matrix, the corresponding mapping
        of term strings to integer ids (column indexes), and the corresponding mapping
        of group strings to integer ids (row indexes).

    Note:
        If you need to transform other sequences of tokenized documents in the same way,
        or if you need more access to the underlying vectorization process,
        consider using :class:`textacy.representations.vectorizers.GroupVectorizer` directly.

    See Also:
        - :class:`textacy.representations.vectorizers.GroupVectorizer`
        - :class:`scipy.sparse.csr_matrix`

    Reference:
        https://en.wikipedia.org/wiki/Document-term_matrix
    """
    vectorizer = vectorizers.GroupVectorizer(
        tf_type=tf_type, idf_type=idf_type, dl_type=dl_type, **kwargs
    )
    grp_term_matrix = vectorizer.fit_transform(tokenized_docs, grps)
    return (grp_term_matrix, vectorizer.vocabulary_terms, vectorizer.vocabulary_grps)
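

# A minimal usage sketch (editorial illustration, not part of the library source):
# the token lists and group labels below are made-up placeholders. Documents that
# share a group label (here, a publication year) are aggregated into one row.
#
#     tokenized_docs = [
#         ["cat", "sit", "mat"],
#         ["cat", "eat", "fish"],
#         ["dog", "sit", "mat"],
#     ]
#     grps = ["2019", "2019", "2020"]
#     gtm, vocab_terms, vocab_grps = build_grp_term_matrix(
#         tokenized_docs, grps, tf_type="linear", idf_type="smooth"
#     )
#     # gtm is a scipy.sparse.csr_matrix of shape (2, # unique terms);
#     # vocab_terms maps terms to column indexes, vocab_grps maps groups to row indexes.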