Source code for textacy.io.csv

"""
:mod:`textacy.io.csv`: Functions for reading from and writing to disk records in CSV format,
where CSVs may be delimited not only by commas (the default) but tabs, pipes, and
other valid one-char delimiters.
"""
from __future__ import annotations

import csv
from typing import Any, Dict, Iterable, Iterator, Optional, Sequence, Type, Union

from .. import types
from . import utils as io_utils


[docs]def read_csv(
    filepath: types.PathLike,
    *,
    encoding: Optional[str] = None,
    fieldnames: Optional[str | Sequence[str]] = None,
    dialect: str | Type[csv.Dialect] = "excel",
    delimiter: str = ",",
    quoting: int = csv.QUOTE_NONNUMERIC,
) -> Iterable[list] | Iterable[dict]:
    """
    Read the contents of a CSV file at ``filepath``, streaming line-by-line,
    where each line is a list of strings and/or floats whose values
    are separated by ``delimiter``.

    Args:
        filepath: Path to file on disk from which data will be read.
        encoding: Name of the encoding used to decode or encode the data in ``filepath``.
        fieldnames: If specified, gives names for columns of values,
            which are used as keys in an ordered dictionary representation
            of each line's data. If 'infer', the first kB of data is analyzed
            to make a guess about whether the first row is a header of column
            names, and if so, those names are used as keys. If None, no column
            names are used, and each line is returned as a list of strings/floats.
        dialect: Grouping of formatting parameters that determine how
            the data is parsed when reading/writing. If 'infer', the first kB
            of data is analyzed to get a best guess for the correct dialect.
        delimiter: 1-character string used to separate fields in a row.
        quoting: Type of quoting to apply to field values. See:
            https://docs.python.org/3/library/csv.html#csv.QUOTE_NONNUMERIC

    Yields:
        List[obj]: Next row, whose elements are strings and/or floats.
        If ``fieldnames`` is None or 'infer' doesn't detect a header row.

        *or*

        Dict[str, obj]: Next row, as an ordered dictionary of (key, value) pairs,
        where keys are column names and values are the corresponding strings
        and/or floats. If ``fieldnames`` is a list of column names or 'infer'
        detects a header row.

    See Also:
        https://docs.python.org/3/library/csv.html#csv.reader
    """
    has_header = False
    with io_utils.open_sesame(filepath, mode="rt", encoding=encoding, newline="") as f:
        if dialect == "infer" or fieldnames == "infer":
            sniffer = csv.Sniffer()
            # add pipes to the list of preferred delimiters, and put spaces last
            sniffer.preferred = [",", "\t", "|", ";", ":", " "]
            # sample = "".join(f.readline() for _ in range(5))  # f.read(1024)
            sample = f.read(1024)
            if dialect == "infer":
                dialect = sniffer.sniff(sample)
            if fieldnames == "infer":
                has_header = sniffer.has_header(sample)
            f.seek(0)
        csv_reader: Union[csv.DictReader, Iterator]
        if has_header is True:
            csv_reader = csv.DictReader(
                f,
                fieldnames=None,
                dialect=dialect,
                delimiter=delimiter,
                quoting=quoting,
            )
        elif fieldnames:
            csv_reader = csv.DictReader(
                f,
                fieldnames=fieldnames,
                dialect=dialect,
                delimiter=delimiter,
                quoting=quoting,
            )
            first_row = next(csv_reader)
            # is the first row a header with same values as fieldnames?
            # if not, we should yield the row as usual
            if not all(key == value for key, value in first_row.items()):
                yield first_row
        else:
            csv_reader = csv.reader(
                f, dialect=dialect, delimiter=delimiter, quoting=quoting,
            )
        for row in csv_reader:
            yield row


[docs]def write_csv(
    data: Iterable[Dict[str, Any]] | Iterable[Iterable],
    filepath: types.PathLike,
    *,
    encoding: Optional[str] = None,
    make_dirs: bool = False,
    fieldnames: Optional[Sequence[str]] = None,
    dialect: str = "excel",
    delimiter: str = ",",
    quoting: int = csv.QUOTE_NONNUMERIC,
) -> None:
    """
    Write rows of ``data`` to disk at ``filepath``, where each row is an iterable
    or a dictionary of strings and/or numbers, written to one line with values
    separated by ``delimiter``.

    Args:
        data: If ``fieldnames`` is None, an iterable of iterables of strings
            and/or numbers to write to disk; for example::

                [['That was a great movie!', 0.9],
                 ['The movie was okay, I guess.', 0.2],
                 ['Worst. Movie. Ever.', -1.0]]

            If ``fieldnames`` is specified, an iterable of dictionaries with
            string and/or number values to write to disk; for example::

                [{'text': 'That was a great movie!', 'score': 0.9},
                 {'text': 'The movie was okay, I guess.', 'score': 0.2},
                 {'text': 'Worst. Movie. Ever.', 'score': -1.0}]

        filepath: Path to file on disk to which data will be written.
        encoding: Name of the encoding used to decode or encode the data in ``filepath``.
        make_dirs: If True, automatically create (sub)directories if not already present
            in order to write ``filepath``.
        fieldnames: Sequence of keys that identify the order in which
            values in each rows' dictionary is written to ``filepath``. These are
            included in ``filepath`` as a header row of column names.

            .. note:: Only specify this if ``data`` is an iterable of dictionaries.

        dialect: Grouping of formatting parameters that determine how
            the data is parsed when reading/writing.
        delimiter: 1-character string used to separate fields in a row.
        quoting: Type of quoting to apply to field values. See:
            https://docs.python.org/3/library/csv.html#csv.QUOTE_NONNUMERIC

    See Also:
        https://docs.python.org/3/library/csv.html#csv.writer
    """
    with io_utils.open_sesame(
        filepath, mode="wt", newline="", encoding=encoding, make_dirs=make_dirs
    ) as f:
        csv_writer: Union[csv.DictWriter, Any]
        if fieldnames:
            csv_writer = csv.DictWriter(
                f, fieldnames, dialect=dialect, delimiter=delimiter, quoting=quoting,
            )
            csv_writer.writeheader()
        else:
            csv_writer = csv.writer(
                f, dialect=dialect, delimiter=delimiter, quoting=quoting,
            )
        csv_writer.writerows(data)
Source code for textacy.io.csv

Navigation

Related Topics