Source code for sorcha.readers.HDF5Reader

import pandas as pd

from sorcha.readers.ObjectDataReader import ObjectDataReader



[docs]
class HDF5DataReader(ObjectDataReader):
    """A class to read in object data files stored as HDF5 files."""

    def __init__(self, filename, **kwargs):
        """A class for reading the object data from an HDF5 file.

        Parameters
        -----------
        filename : string
            location/name of the data file.
        """
        super().__init__(**kwargs)

[docs]
        self.filename = filename


        # A table holding just the object ID for each row. Only populated
        # if we try to read data for specific object IDs.

[docs]
        self.obj_id_table = None



[docs]
    def get_reader_info(self):
        """Return a string identifying the current reader name
        and input information (for logging and output).

        Returns
        --------
        name : string
            The reader information.
        """
        return f"HDF5DataReader:{self.filename}"



[docs]
    def _read_rows_internal(self, block_start=0, block_size=None, **kwargs):
        """Reads in a set number of rows from the input.

        Parameters
        -----------
        block_start : integer, optional
            The 0-indexed row number from which
            to start reading the data. For example in a CSV file
            block_start=2 would skip the first two lines after the header
            and return data starting on row=2. Default=0

        block_size : integer, optional
            the number of rows to read in.
            Use block_size=None to read in all available data.
            Default = None

        **kwargs : dictionary, optional
            Extra arguments

        Returns
        -----------
        res_df  : pandas dataframe
            Dataframe of the object data.
        """
        if block_size is None:
            res_df = pd.read_hdf(
                self.filename,
                start=block_start,
            )
        else:
            res_df = pd.read_hdf(
                self.filename,
                start=block_start,
                stop=block_start + block_size,
            )
        return res_df



[docs]
    def _build_id_map(self):
        """Builds a table of just the object IDs"""
        if self.obj_id_table is not None:
            return
        self.obj_id_table = pd.read_hdf(self.filename, columns=["ObjID"])
        self.obj_id_table = self._validate_object_id_column(self.obj_id_table)



[docs]
    def _read_objects_internal(self, obj_ids, **kwargs):
        """Read in a chunk of data for given object IDs.

        Parameters
        -----------
        obj_ids : list
            A list of object IDs to use.

        **kwargs : dictionary, optional
            Extra arguments

        Returns
        -----------
        res_df : Pandas dataframe
            The dataframe for the object data.
        """
        self._build_id_map()
        row_match = self.obj_id_table["ObjID"].isin(obj_ids)
        match_inds = self.obj_id_table[row_match].index
        res_df = pd.read_hdf(self.filename, where="index=match_inds")  # noqa: F841
        return res_df



[docs]
    def _process_and_validate_input_table(self, input_table, **kwargs):
        """Perform any input-specific processing and validation on the input table.
        Modifies the input dataframe in place.

        Notes
        ------
        The base implementation includes filtering that is common to most
        input types. Subclasses should call super.process_and_validate()
        to ensure that the ancestor’s validation is also applied.

        Parameters
        -----------
        input_table : pandas dataframe
            A loaded table.

        **kwargs : dictionary, optional
            Extra arguments

        Returns
        -----------
        input_table : pandas dataframe
            Returns the input dataframe modified in-place.
        """
        # Perform the parent class's validation (checking object ID column).
        input_table = super()._process_and_validate_input_table(input_table, **kwargs)

        return input_table