"""
Sparse Vectors
--------------
:mod:`textacy.representations.sparse_vec`: Transform a collection of tokenized docs
into a doc-term matrix of shape (# docs, # unique terms) or a group-term matrix
of shape (# unique groups, # unique terms), with various ways to filter/limit
included terms and flexible weighting/normalization schemes for their values.
Intended primarily as a simpler- and higher-level API for sparse vectorization of docs.
"""
from typing import Dict, Iterable, Optional, Tuple
import scipy.sparse as sp
from . import vectorizers
[docs]def build_doc_term_matrix(
tokenized_docs: Iterable[Iterable[str]],
*,
tf_type: str = "linear", # Literal["linear", "sqrt", "log", "binary"]
idf_type: Optional[str] = None, # Optional[Literal["standard", "smooth", "bm25"]]
dl_type: Optional[str] = None, # Optional[Literal["linear", "sqrt", "log"]]
**kwargs
) -> Tuple[sp.csr_matrix, Dict[str, int]]:
"""
Transform one or more tokenized documents into a document-term matrix
of shape (# docs, # unique terms), with flexible weighting/normalization of values.
Args:
tokenized_docs: A sequence of tokenized documents, where each is a sequence
of term strings. For example::
>>> ([tok.lemma_ for tok in spacy_doc]
... for spacy_doc in spacy_docs)
>>> ((ne.text for ne in extract.entities(doc))
... for doc in corpus)
tf_type: Type of term frequency (tf) to use for weights' local component:
- "linear": tf (tfs are already linear, so left as-is)
- "sqrt": tf => sqrt(tf)
- "log": tf => log(tf) + 1
- "binary": tf => 1
idf_type: Type of inverse document frequency (idf) to use for weights'
global component:
- "standard": idf = log(n_docs / df) + 1.0
- "smooth": idf = log(n_docs + 1 / df + 1) + 1.0, i.e. 1 is added
to all document frequencies, as if a single document containing
every unique term was added to the corpus.
- "bm25": idf = log((n_docs - df + 0.5) / (df + 0.5)), which is
a form commonly used in information retrieval that allows for
very common terms to receive negative weights.
- None: no global weighting is applied to local term weights.
dl_type: Type of document-length scaling to use for weights'
normalization component:
- "linear": dl (dls are already linear, so left as-is)
- "sqrt": dl => sqrt(dl)
- "log": dl => log(dl)
- None: no normalization is applied to local(*global?) weights
**kwargs: Passed directly into vectorizer class
Returns:
Document-term matrix as a sparse row matrix, and
the corresponding mapping of term strings to integer ids (column indexes).
Note:
If you need to transform other sequences of tokenized documents in the same way,
or if you need more access to the underlying vectorization process,
consider using :class:`textacy.representations.vectorizers.Vectorizer` directly.
See Also:
- :class:`textacy.representations.vectorizers.Vectorizer`
- :class:`scipy.sparse.csr_matrix`
Reference:
https://en.wikipedia.org/wiki/Document-term_matrix
"""
vectorizer = vectorizers.Vectorizer(
tf_type=tf_type, idf_type=idf_type, dl_type=dl_type, **kwargs
)
doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
return (doc_term_matrix, vectorizer.vocabulary_terms)
[docs]def build_grp_term_matrix(
tokenized_docs: Iterable[Iterable[str]],
grps: Iterable[str],
*,
tf_type: str = "linear", # Literal["linear", "sqrt", "log", "binary"]
idf_type: Optional[str] = None, # Optional[Literal["standard", "smooth", "bm25"]]
dl_type: Optional[str] = None, # Optional[Literal["linear", "sqrt", "log"]]
**kwargs
) -> sp.csr_matrix:
"""
Transform one or more tokenized documents into a group-term matrix
of shape (# unique groups, # unique terms),
with flexible weighting/normalization of values.
This is an extension of typical document-term matrix vectorization, where
terms are grouped by the documents in which they co-occur. It allows for
customized grouping, such as by a shared author or publication year, that
may span multiple documents, without forcing users to merge those documents
themselves.
Args:
tokenized_docs: A sequence of tokenized documents, where each is a sequence
of term strings. For example::
>>> ([tok.lemma_ for tok in spacy_doc]
... for spacy_doc in spacy_docs)
>>> ((ne.text for ne in extract.entities(doc))
... for doc in corpus)
grps: Sequence of group names by which the terms in ``tokenized_docs``
are aggregated, where the first item in ``grps`` corresponds to
the first item in ``tokenized_docs``, and so on.
tf_type: Type of term frequency (tf) to use for weights' local component:
- "linear": tf (tfs are already linear, so left as-is)
- "sqrt": tf => sqrt(tf)
- "log": tf => log(tf) + 1
- "binary": tf => 1
idf_type: Type of inverse document frequency (idf) to use for weights'
global component:
- "standard": idf = log(n_docs / df) + 1.0
- "smooth": idf = log(n_docs + 1 / df + 1) + 1.0, i.e. 1 is added
to all document frequencies, as if a single document containing
every unique term was added to the corpus.
- "bm25": idf = log((n_docs - df + 0.5) / (df + 0.5)), which is
a form commonly used in information retrieval that allows for
very common terms to receive negative weights.
- None: no global weighting is applied to local term weights.
dl_type: Type of document-length scaling to use for weights'
normalization component:
- "linear": dl (dls are already linear, so left as-is)
- "sqrt": dl => sqrt(dl)
- "log": dl => log(dl)
- None: no normalization is applied to local(*global?) weights
**kwargs: Passed directly into vectorizer class
Returns:
Group-term matrix as a sparse row matrix, and
the corresponding mapping of term strings to integer ids (column indexes), and
the corresponding mapping of group strings to integer ids (row indexes).
Note:
If you need to transform other sequences of tokenized documents in the same way,
or if you need more access to the underlying vectorization process, consider
using :class:`textacy.representations.vectorizers.GroupVectorizer` directly.
See Also:
- :class:`textacy.representations.vectorizers.GroupVectorizer`
- :class:`scipy.sparse.csr_matrix`
Reference:
https://en.wikipedia.org/wiki/Document-term_matrix
"""
vectorizer = vectorizers.GroupVectorizer(
tf_type=tf_type, idf_type=idf_type, dl_type=dl_type, **kwargs
)
grp_term_matrix = vectorizer.fit_transform(tokenized_docs, grps)
return (grp_term_matrix, vectorizer.vocabulary_terms, vectorizer.vocabulary_grps)