Source code for parq_tools.parq_profile

"""
parq_profile.py

Utilities for profiling Parquet files and generating HTML reports using ydata-profiling, with support for notebook and browser display.

Main API:

- ParquetProfileReport: Class for generating, saving, and displaying profile reports for Parquet files.
"""
import json
from pathlib import Path
from typing import Iterator, Optional, List, Union
import pandas as pd
import pyarrow.parquet as pq

from ydata_profiling import ProfileReport

from parq_tools.utils import atomic_output_file
from parq_tools.utils.metadata_utils import get_table_metadata, get_column_metadata, get_pandas_metadata
from parq_tools.utils.profile_utils import ColumnarProfileReport, ProfileMetadata


def parquet_column_generator(parquet_path: Union[str, Path],
                             columns: Optional[List[str]] = None) -> Iterator[pd.Series]:
    """
    Yields columns from a Parquet file as pandas Series.

    Every column is fetched with its own ``read`` call, so the file is never
    loaded as a whole by this function.

    Args:
        parquet_path (str or Path): Path to the Parquet file.
        columns (List[str], optional): List of column names to yield. If None, yields all columns.
            An explicitly empty list yields nothing.

    Yields:
        pd.Series: Each column as a pandas Series.

    Raises:
        ValueError: If a requested column is not present in the file. Because this is a
            generator, the error surfaces when iteration reaches that column.
    """
    pq_file = pq.ParquetFile(str(parquet_path))
    pandas_metadata = get_pandas_metadata(pq_file)
    # Kept as a list: membership is tested with ``in`` exactly as before.
    index_columns = pandas_metadata.get('index_columns', []) if pandas_metadata else []

    # Resolve the schema once instead of on every loop iteration.
    schema_names = pq_file.schema.names
    available = set(schema_names)

    # ``is None`` (not truthiness) so that ``columns=[]`` means "no columns",
    # matching how ParquetProfileReport resolves its own column list.
    selected = schema_names if columns is None else columns

    for col in selected:
        if col not in available:
            raise ValueError(f"Column '{col}' not found in Parquet file.")
        frame = pq_file.read(columns=[col]).to_pandas()
        if col in index_columns:
            # Presumably to_pandas() restores a pandas index column as the
            # frame's index; reset_index() exposes it as a regular column again.
            frame = frame.reset_index()
        yield frame[col]
class ParquetProfileReport:
    """For ydata-profiler reports on large parquet files.

    Useful for profiling large Parquet files without loading them entirely into memory.
    This class supports both native profiling (without chunking) and columnar profiling (with chunking).

    Typical use is ``ParquetProfileReport(path).profile()`` followed by
    ``to_html()``, ``save_html(...)`` or ``show()``.
    """

    def __init__(self,
                 parquet_path: Union[str, Path],
                 columns: Optional[List[str]] = None,
                 batch_size: Optional[int] = 1,  # Number of columns to process in each batch
                 show_progress: bool = True,
                 title: str = "Parquet Profile Report",
                 dataset_metadata: Optional[Union[dict, ProfileMetadata]] = None,
                 column_descriptions: Optional[dict[str, str]] = None) -> None:
        """
        Initialize the ParquetProfileReport.

        Args:
            parquet_path: Path to the Parquet file to profile.
            columns: List of column names to include in the profile. If None, all columns are used.
            batch_size: Optional[int]: Number of columns to process in each batch. If None, processes all columns at once.
            show_progress: bool: If True, displays a progress bar during profiling.
            title: Title of the report.
            dataset_metadata: Optional[Union[dict, ProfileMetadata]]: Metadata for the dataset. Will over-ride any
                metadata in the Parquet file.
            column_descriptions: Optional[dict[str, str]]: Column descriptions for the dataset. Will over-ride any
                descriptions in the Parquet file.
        """
        self.parquet_path = parquet_path
        self.batch_size = batch_size
        self.show_progress = show_progress
        self.title = title
        self.report: Optional[ProfileReport] = None

        # Normalise user-supplied metadata to a ProfileMetadata instance (or None).
        if isinstance(dataset_metadata, ProfileMetadata):
            metadata = dataset_metadata
        elif dataset_metadata:
            metadata = ProfileMetadata.from_dict(dataset_metadata)
        else:
            metadata = None
        self.dataset_metadata = metadata
        self.column_descriptions = column_descriptions

        pq_file = pq.ParquetFile(str(self.parquet_path))
        self.columns = pq_file.schema.names if columns is None else columns

        if not self.dataset_metadata:
            # If no metadata is provided, use the Parquet file metadata
            table_meta: dict = get_table_metadata(pq_file)
            self.dataset_metadata = ProfileMetadata.from_dict(table_meta) if pq_file.metadata else None

        if self.column_descriptions is None:
            # If no column descriptions are provided, use the Parquet file metadata,
            # restricted to the columns that will actually be profiled.
            wanted = set(self.columns)
            file_column_meta = get_column_metadata(pq_file)
            self.column_descriptions = {col: desc.get("description", "")
                                        for col, desc in file_column_meta.items()
                                        if col in wanted}

    def profile(self) -> 'ParquetProfileReport':
        """
        Profiles the Parquet file

        With ``batch_size=None`` the selected columns are loaded into a single
        DataFrame and handed to ydata-profiling directly. Otherwise the columns
        are streamed one at a time through ``ColumnarProfileReport``.

        Returns:
            ParquetProfileReport: This instance, with ``report`` populated.
        """
        if self.batch_size is None:
            # Native ydata profiling (no chunking)
            df = pd.read_parquet(self.parquet_path, columns=self.columns)
            report_kwargs = {}
            if self.dataset_metadata is not None:
                # __init__ may leave the metadata unset; only forward it when present.
                report_kwargs["dataset"] = self.dataset_metadata.to_dict()
            if self.column_descriptions:
                # ydata-profiling documents per-column descriptions as living
                # under the "descriptions" key of ``variables``.
                report_kwargs["variables"] = {"descriptions": self.column_descriptions}
            self.report = ProfileReport(df,
                                        minimal=True,
                                        explorative=False,
                                        progress_bar=False,
                                        title=self.title,
                                        **report_kwargs)
        else:
            # Columnar profiling
            gen = parquet_column_generator(self.parquet_path, columns=self.columns)
            report = ColumnarProfileReport(column_generator=gen,
                                           column_count=len(self.columns),
                                           batch_size=self.batch_size,
                                           show_progress=self.show_progress,
                                           title=self.title,
                                           dataset_metadata=self.dataset_metadata,
                                           column_descriptions=self.column_descriptions)
            report.profile()
            self.report = report.report
        return self

    def to_html(self) -> str:
        """The HTML representation of the profile report.

        Raises:
            RuntimeError: If ``profile()`` has not been called yet.
        """
        if self.report is None:
            raise RuntimeError("No report generated. Call profile() first.")
        return self.report.to_html()

    def save_html(self, output_html: Path) -> None:
        """Save the profile report to a HTML file.

        The HTML is written as UTF-8 to the temporary path provided by
        ``atomic_output_file``.

        Args:
            output_html: Destination path of the HTML file.

        Raises:
            RuntimeError: If ``profile()`` has not been called yet.
        """
        with atomic_output_file(output_html) as tmp_path:
            tmp_path.write_text(self.to_html(), encoding="utf-8")

    def show(self, notebook: bool = False):
        """Display the profile report in a notebook or open in a browser.

        Args:
            notebook (bool): If True, display in Jupyter notebook. If False, open in browser.

        Raises:
            RuntimeError: If ``profile()`` has not been called yet.
        """
        if self.report is None:
            # Same failure mode as to_html(), instead of an AttributeError on None.
            raise RuntimeError("No report generated. Call profile() first.")

        if notebook:
            self.report.to_notebook_iframe()
            return

        import tempfile
        import webbrowser

        # Render first so a failure does not leave an empty temp file behind.
        html = self.to_html()
        # delete=False: the browser needs the file to outlive this call.
        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp:
            tmp.write(html.encode("utf-8"))
        # as_uri() yields a well-formed, escaped file:// URL on every platform.
        webbrowser.open_new_tab(Path(tmp.name).as_uri())