Source code for textacy.io.http

"""
:mod:`textacy.io.http`: Functions for fetching data from URLs via streaming HTTP GET
requests, either reading it into memory or writing it directly to disk.
"""
from __future__ import annotations

import logging
from contextlib import closing
from typing import Iterable, Optional, Tuple

import requests
from tqdm import tqdm

from .. import types, utils
from . import utils as io_utils

LOGGER = logging.getLogger(__name__)


def read_http_stream(
    url: str,
    *,
    lines: bool = False,
    decode_unicode: bool = False,
    chunk_size: int = 1024,
    auth: Optional[Tuple[str, str]] = None,
) -> Iterable[str] | Iterable[bytes]:
    """
    Read data from ``url`` in a stream, either all at once or line-by-line.

    Args:
        url: URL to which a GET request is made for data.
        lines: If False, yield all of the data at once; otherwise, yield data
            line-by-line.
        decode_unicode: If True, yield data as unicode, where the encoding is
            taken from the HTTP response headers; otherwise, yield bytes.
        chunk_size: Number of bytes read into memory per chunk. Because decoding
            may occur, this is not necessarily the length of each chunk.
        auth: (username, password) pair for simple HTTP authentication required
            (if at all) to access the data at ``url``.

            .. seealso:: http://docs.python-requests.org/en/master/user/authentication/

    Yields:
        If ``lines`` is True, the next line in the response data, which is bytes
        if ``decode_unicode`` is False or unicode otherwise. If ``lines`` is False,
        yields the full response content, either as bytes or unicode.
    """
    # always close the connection
    with closing(requests.get(url, stream=True, auth=auth)) as r:
        # set fallback encoding if unable to infer from headers
        if r.encoding is None:
            r.encoding = "utf-8"
        if lines is False:
            if decode_unicode is True:
                yield r.text
            else:
                yield r.content
        else:
            lines_ = r.iter_lines(chunk_size=chunk_size, decode_unicode=decode_unicode)
            for line in lines_:
                if line:
                    yield line
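
# A minimal usage sketch for ``read_http_stream`` (the URL below is an
# illustrative placeholder, not part of textacy): stream a remote text file
# line-by-line as unicode, without holding the full response in memory.
#
#   >>> for line in read_http_stream(
#   ...     "https://example.com/data.txt", lines=True, decode_unicode=True
#   ... ):
#   ...     print(line)
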
def write_http_stream(
    url: str,
    filepath: types.PathLike,
    *,
    mode: str = "wt",
    encoding: Optional[str] = None,
    make_dirs: bool = False,
    chunk_size: int = 1024,
    auth: Optional[Tuple[str, str]] = None,
) -> None:
    """
    Download data from ``url`` in a stream, and write successive chunks to disk
    at ``filepath``.

    Args:
        url: URL to which a GET request is made for data.
        filepath: Path to file on disk to which data will be written.
        mode: Mode with which ``filepath`` is opened.
        encoding: Name of the encoding used to decode or encode the data
            in ``filepath``. Only applicable in text mode.

            .. note:: The encoding on the HTTP response is inferred from its
               headers, or set to 'utf-8' as a fall-back in the case that no
               encoding is detected. It is *not* set by ``encoding``.

        make_dirs: If True, automatically create (sub)directories if not
            already present in order to write ``filepath``.
        chunk_size: Number of bytes read into memory per chunk. Because decoding
            may occur, this is not necessarily the length of each chunk.
        auth: (username, password) pair for simple HTTP authentication required
            (if at all) to access the data at ``url``.

            .. seealso:: http://docs.python-requests.org/en/master/user/authentication/
    """
    # decode bytes into text iff writing in a text mode
    decode_unicode = "t" in mode
    filepath = utils.to_path(filepath).resolve()
    if make_dirs is True:
        io_utils._make_dirs(filepath, mode)
    # use `closing` to ensure connection and progress bar *always* close
    with closing(requests.get(url, stream=True, auth=auth)) as r:
        LOGGER.info("downloading data from %s ...", url)
        # set fallback encoding if unable to infer from headers
        if r.encoding is None:
            r.encoding = "utf-8"
        total = int(r.headers.get("content-length", 0))
        with closing(tqdm(unit="B", unit_scale=True, total=total)) as pbar:
            with filepath.open(mode=mode, encoding=encoding) as f:
                chunks = r.iter_content(
                    chunk_size=chunk_size, decode_unicode=decode_unicode
                )
                for chunk in chunks:
                    # filter out empty "keep-alive" chunks
                    if chunk:
                        # note: in text mode, chunk lengths are counted in
                        # characters, so progress vs. the byte total is approximate
                        pbar.update(len(chunk))
                        f.write(chunk)
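
# A minimal usage sketch for ``write_http_stream`` (the URL and local path are
# illustrative placeholders, not part of textacy): download a file to disk in
# binary mode, creating parent directories as needed, with a progress bar.
#
#   >>> write_http_stream(
#   ...     "https://example.com/archive.zip",
#   ...     "data/archive.zip",
#   ...     mode="wb",
#   ...     make_dirs=True,
#   ... )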