"""
:mod:`textacy.io.spacy`: Functions for reading from and writing to disk spacy documents
in either pickle or binary format. Be warned: Both formats have pros and cons.
"""
from __future__ import annotations
import pickle
from typing import Iterable, Optional
from spacy.tokens import Doc, DocBin
from .. import errors, spacier, types
from . import utils as io_utils
def read_spacy_docs(
    filepath: types.PathLike,
    *,
    format: str = "binary",
    lang: Optional[types.LangLike] = None,
) -> Iterable[Doc]:
    """
    Read the contents of a file at ``filepath``, written in binary or pickle format.

    Args:
        filepath: Path to file on disk from which data will be read.
        format ({"binary", "pickle"}): Format of the data that was written to disk.
            If "binary", uses :class:`spacy.tokens.DocBin` to deserialize data;
            if "pickle", uses python's stdlib ``pickle``.

            .. warning:: Docs written in pickle format were saved all together
               as a list, which means they're all loaded into memory at once
               before streaming one by one. Mind your RAM usage, especially when
               reading many docs!

            .. warning:: Unpickling can execute arbitrary code. Only read pickle
               files that come from a source you trust.

        lang: Language with which spaCy originally processed docs, represented as
            the full name of or path on disk to the pipeline, or an already
            instantiated pipeline instance.
            Note that this is only required when ``format`` is "binary".

    Yields:
        Next deserialized document.

    Raises:
        ValueError: if format is not "binary" or "pickle", or if ``lang`` is None
            when ``format="binary"``

    Note:
        This is a generator function: nothing is read from disk, and no error
        is raised, until the returned iterator is advanced for the first time.
    """
    if format == "binary":
        if lang is None:
            raise ValueError(
                "lang=None is invalid. When format='binary', a `spacy.Language` "
                "(well, its associated `spacy.Vocab`) is required to deserialize "
                "the binary data. Note that this should be the same language pipeline "
                "used when processing the original docs!"
            )
        # only the pipeline's vocab is needed to rebuild the docs
        lang = spacier.utils.resolve_langlike(lang)
        docbin = DocBin().from_disk(filepath)
        yield from docbin.get_docs(lang.vocab)
    elif format == "pickle":
        with io_utils.open_sesame(filepath, mode="rb") as f:
            # SECURITY: pickle.load() will run arbitrary code embedded in the file;
            # callers must only pass paths to trusted data.
            # The file stays open while docs are streamed out of the loaded list.
            yield from pickle.load(f)
    else:
        raise ValueError(
            errors.value_invalid_msg("format", format, {"binary", "pickle"})
        )
def write_spacy_docs(
    data: Doc | Iterable[Doc],
    filepath: types.PathLike,
    *,
    make_dirs: bool = False,
    format: str = "binary",
    attrs: Optional[Iterable[str]] = None,
    store_user_data: bool = False,
) -> None:
    """
    Write one or more ``Doc`` s to disk at ``filepath`` in binary or pickle format.

    Args:
        data: A single ``Doc`` or a sequence of ``Doc`` s to write to disk.
        filepath: Path to file on disk to which data will be written.
        make_dirs: If True, automatically create (sub)directories
            if not already present in order to write ``filepath``.
            Applies to both formats.
        format ({"pickle", "binary"}): Format of the data written to disk.
            If "binary", uses :class:`spacy.tokens.DocBin` to serialize data;
            if "pickle", uses python's stdlib ``pickle``.

            .. warning:: When writing docs in pickle format, all the docs in ``data``
               must be saved as a list, which means they're all loaded into memory.
               Mind your RAM usage, especially when writing many docs!

        attrs: List of attributes to serialize if ``format`` is "binary". If None,
            spaCy's default values are used; see here: https://spacy.io/api/docbin#init
        store_user_data: If True, write :attr:`Doc.user_data` and the values of custom
            extension attributes to disk; otherwise, don't.

            .. warning:: When ``format`` is "pickle" and this is False, each doc's
               ``user_data`` is cleared *in place* before pickling, so the docs
               passed in as ``data`` are modified.

    Raises:
        ValueError: if format is not "binary" or "pickle"
    """
    if isinstance(data, Doc):
        data = [data]
    if format == "binary":
        kwargs = {"docs": data, "store_user_data": store_user_data}
        # leave "attrs" out entirely when unspecified so that DocBin's own defaults apply
        if attrs is not None:
            kwargs["attrs"] = list(attrs)
        docbin = DocBin(**kwargs)
        if make_dirs:
            # the binary branch writes via DocBin.to_disk() rather than open_sesame(),
            # so any missing parent directories have to be created here
            import pathlib

            pathlib.Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        docbin.to_disk(filepath)
    elif format == "pickle":
        if store_user_data is False:
            # lazily clears each doc's user_data (in place) as the docs are consumed below
            data = _clear_docs_user_data(data)
        with io_utils.open_sesame(filepath, mode="wb", make_dirs=make_dirs) as f:
            # protocol=-1 selects the highest pickle protocol available
            pickle.dump(list(data), f, protocol=-1)
    else:
        raise ValueError(
            errors.value_invalid_msg("format", format, {"binary", "pickle"})
        )
def _clear_docs_user_data(docs: Iterable[Doc]) -> Iterable[Doc]:
    """
    Lazily empty the ``user_data`` of every doc in ``docs``, yielding each doc
    right after it has been emptied.

    The clearing happens in place, one doc at a time as the result is iterated,
    so the yielded objects are the very same ones that were passed in.
    """
    # TODO: figure out if/how to clear out custom doc extension values

    def _emptied(spacy_doc):
        spacy_doc.user_data.clear()
        return spacy_doc

    yield from map(_emptied, docs)