Skip to content

BaseTable

clifpy.tables.base_table.BaseTable

BaseTable(
    data_directory,
    filetype,
    timezone,
    output_directory=None,
    data=None,
)

Base class for all clifpy table classes.

Provides common functionality for loading data, running validations, and generating reports. All table-specific classes should inherit from this.

Attributes:

Name Type Description
data_directory str

Path to the directory containing data files

filetype str

Type of data file (csv, parquet, etc.)

timezone str

Timezone for datetime columns

output_directory str

Directory for saving output files and logs

table_name str

Name of the table (from class name)

df DataFrame

The loaded data

schema dict

The YAML schema for this table

errors List[dict]

Validation errors from last validation run

logger Logger

Logger for this table

Initialize the BaseTable.

Parameters:

Name Type Description Default
data_directory str

Path to the directory containing data files

required
filetype str

Type of data file (csv, parquet, etc.)

required
timezone str

Timezone for datetime columns

required
output_directory str

Directory for saving output files and logs. If not provided, creates an 'output' directory in the current working directory.

None
data DataFrame

Pre-loaded data to use instead of loading from file

None
Source code in clifpy/tables/base_table.py
def __init__(
    self,
    data_directory: str,
    filetype: str,
    timezone: str,
    output_directory: Optional[str] = None,
    data: Optional[pd.DataFrame] = None
):
    """
    Initialize the BaseTable.

    Parameters:
        data_directory (str): Path to the directory containing data files
        filetype (str): Type of data file (csv, parquet, etc.)
        timezone (str): Timezone for datetime columns
        output_directory (str, optional): Directory for saving output files and logs.
            If not provided, creates an 'output' directory in the current working directory.
        data (pd.DataFrame, optional): Pre-loaded data to use instead of loading from file
    """
    # Record the configuration exactly as supplied by the caller.
    self.data_directory = data_directory
    self.filetype = filetype
    self.timezone = timezone

    # Default the output location to ./output and guarantee it exists.
    self.output_directory = (
        output_directory
        if output_directory is not None
        else os.path.join(os.getcwd(), 'output')
    )
    os.makedirs(self.output_directory, exist_ok=True)

    # Derive a snake_case table name from the PascalCase class name,
    # e.g. Adt -> adt, RespiratorySupport -> respiratory_support.
    pieces = []
    for ch in self.__class__.__name__:
        if ch.isupper():
            pieces.append('_')
            pieces.append(ch.lower())
        else:
            pieces.append(ch)
    self.table_name = ''.join(pieces).lstrip('_')

    # Data and validation state; schema is filled in by _load_schema() below.
    self.df: Optional[pd.DataFrame] = data
    self.errors: List[Dict[str, Any]] = []
    self.schema: Optional[Dict[str, Any]] = None
    self._validated: bool = False

    # Wire up per-table logging, then load the YAML schema for this table.
    self._setup_logging()
    self._load_schema()

from_file classmethod

from_file(
    data_directory,
    filetype,
    timezone="UTC",
    output_directory=None,
    sample_size=None,
    columns=None,
    filters=None,
)

Load data from file and create a table instance.

Parameters:

Name Type Description Default
data_directory str

Path to the directory containing data files

required
filetype str

Type of data file (csv, parquet, etc.)

required
timezone str

Timezone for datetime columns (default: UTC)

'UTC'
output_directory str

Directory for saving output files and logs

None
sample_size int

Number of rows to load

None
columns List[str]

Specific columns to load

None
filters Dict

Filters to apply when loading

None

Returns:

Type Description

Instance of the table class with loaded data

Source code in clifpy/tables/base_table.py
@classmethod
def from_file(
    cls,
    data_directory: str,
    filetype: str,
    timezone: str = "UTC",
    output_directory: Optional[str] = None,
    sample_size: Optional[int] = None,
    columns: Optional[List[str]] = None,
    filters: Optional[Dict[str, Any]] = None
):
    """
    Load data from file and create a table instance.

    Parameters:
        data_directory (str): Path to the directory containing data files
        filetype (str): Type of data file (csv, parquet, etc.)
        timezone (str): Timezone for datetime columns (default: UTC)
        output_directory (str, optional): Directory for saving output files and logs
        sample_size (int, optional): Number of rows to load
        columns (List[str], optional): Specific columns to load
        filters (Dict, optional): Filters to apply when loading

    Returns:
        Instance of the table class with loaded data
    """
    # Snake_case table name derived from the PascalCase class name
    # (mirrors the derivation performed in __init__).
    chars = ('_' + ch.lower() if ch.isupper() else ch for ch in cls.__name__)
    table_name = ''.join(chars).lstrip('_')

    # Delegate the actual file reading to the shared io utility.
    frame = load_data(
        table_name,
        data_directory,
        filetype,
        sample_size=sample_size,
        columns=columns,
        filters=filters,
        site_tz=timezone
    )

    # Hand the loaded frame straight to the normal constructor.
    return cls(
        data_directory=data_directory,
        filetype=filetype,
        timezone=timezone,
        output_directory=output_directory,
        data=frame
    )

get_summary

get_summary()

Get a summary of the table data.

Returns:

Name Type Description
dict Dict[str, Any]

Summary statistics and information about the table

Source code in clifpy/tables/base_table.py
def get_summary(self) -> Dict[str, Any]:
    """
    Get a summary of the table data.

    Returns:
        dict: Summary statistics and information about the table
    """
    # Nothing loaded yet -- report that instead of raising.
    if self.df is None:
        return {"status": "No data loaded"}

    frame = self.df
    summary: Dict[str, Any] = {
        "table_name": self.table_name,
        "num_rows": len(frame),
        "num_columns": len(frame.columns),
        "columns": list(frame.columns),
        "memory_usage_mb": frame.memory_usage(deep=True).sum() / (1024 * 1024),
        "validation_run": self._validated,
        # Error count is only meaningful after validate() has run.
        "validation_errors": len(self.errors) if self._validated else None,
        "is_valid": self.isvalid(),
    }

    # Describe numeric columns, when any are present.
    numeric_cols = frame.select_dtypes(include=['number']).columns
    if len(numeric_cols) > 0:
        summary["numeric_columns"] = list(numeric_cols)
        summary["numeric_stats"] = frame[numeric_cols].describe().to_dict()

    # Only report columns that actually contain missing values.
    null_counts = frame.isnull().sum()
    if null_counts.any():
        summary["missing_data"] = null_counts[null_counts > 0].to_dict()

    return summary

isvalid

isvalid()

Check if the data is valid based on the last validation run.

Returns:

Name Type Description
bool bool

True if validation has been run and no errors were found, False if validation found errors or hasn't been run yet

Source code in clifpy/tables/base_table.py
def isvalid(self) -> bool:
    """
    Check if the data is valid based on the last validation run.

    Returns:
        bool: True if validation has been run and no errors were found,
              False if validation found errors or hasn't been run yet
    """
    if self._validated:
        # Valid means the last run produced zero errors.
        return len(self.errors) == 0
    print("Validation has not been run yet. Please call validate() first.")
    return False

save_summary

save_summary()

Save table summary to a JSON file.

Source code in clifpy/tables/base_table.py
def save_summary(self):
    """Save table summary to a JSON file."""
    # Best-effort: failures are logged, never raised to the caller.
    try:
        import json

        summary = self.get_summary()

        # One JSON file per table, written into the output directory.
        target = os.path.join(
            self.output_directory,
            f'summary_{self.table_name}.json'
        )

        # default=str stringifies anything json can't serialize natively.
        with open(target, 'w') as fh:
            json.dump(summary, fh, indent=2, default=str)

        self.logger.info(f"Saved summary to {target}")

    except Exception as e:
        self.logger.error(f"Error saving summary: {str(e)}")

validate

validate()

Run comprehensive validation on the data.

This method runs all validation checks including:

- Schema validation (required columns, data types, categories)
- Missing data analysis
- Duplicate checking
- Statistical analysis
- Table-specific validations (if overridden in child class)

Source code in clifpy/tables/base_table.py
def validate(self):
    """
    Run comprehensive validation on the data.

    This method runs all validation checks including:
    - Schema validation (required columns, data types, categories)
    - Missing data analysis
    - Duplicate checking
    - Statistical analysis
    - Table-specific validations (if overridden in child class)
    """
    # Nothing to validate without a dataframe.
    if self.df is None:
        self.logger.warning("No dataframe to validate")
        print("No dataframe to validate.")
        return

    self.logger.info("Starting validation")
    # Reset state before re-running; mark as validated up-front so
    # isvalid() reflects this run even if an exception is recorded below.
    self.errors = []
    self._validated = True

    try:
        # Basic schema validation (columns, types, categories).
        if self.schema:
            self.logger.info("Running schema validation")
            found = validator.validate_dataframe(self.df, self.schema)
            self.errors.extend(found)

            if found:
                self.logger.warning(f"Schema validation found {len(found)} errors")
            else:
                self.logger.info("Schema validation passed")

        # Enhanced validations (these will be implemented in Phase 3).
        self._run_enhanced_validations()

        # Table-specific validations (can be overridden in child classes).
        self._run_table_specific_validations()

        # Report the outcome; persist errors only when some were found.
        error_count = len(self.errors)
        if error_count:
            self.logger.warning(f"Validation completed with {error_count} error(s)")
            print(f"Validation completed with {error_count} error(s). See `errors` attribute.")
            self._save_validation_errors()
        else:
            self.logger.info("Validation completed successfully")
            print("Validation completed successfully.")

    except Exception as e:
        # Record the failure as a validation error rather than raising.
        self.logger.error(f"Error during validation: {str(e)}")
        self.errors.append({
            "type": "validation_error",
            "message": str(e)
        })