Renaming and Metadata

A simple example to demonstrate how to rename columns in a Parquet file. Additionally, we can update the metadata in the file – in this case we add table-level metadata and per-column descriptions.

import json
import tempfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path

from parq_tools import rename_and_update_metadata

Create a Parquet file

Create a temporary parquet file for demonstration

def create_parquet_file(file_path: Path):
    """Write a small ten-row demonstration table to ``file_path`` as Parquet.

    The table has three integer columns used as index columns in this
    example (``x``, ``y``, ``z``) and three supplementary columns
    (``a``, ``b``, ``c``).

    Args:
        file_path: Destination of the Parquet file.
    """
    row_ids = range(1, 11)

    columns = {
        # Index columns
        "x": row_ids,
        "y": range(11, 21),
        "z": range(21, 31),
        # Supplementary columns
        "a": [f"val{i}" for i in row_ids],
        "b": [2 * i for i in row_ids],
        "c": [i % 3 for i in row_ids],
    }

    # Build an Arrow table from the column mapping and write it out.
    table = pa.Table.from_pydict(columns)
    pq.write_table(table, file_path)


# Put the demonstration file in the system temporary directory.
parquet_file_path = Path(tempfile.gettempdir(), "example_data.parquet")
create_parquet_file(parquet_file_path)

View the file as a DataFrame

# Load the Parquet file into pandas; the bare name on the last line displays the frame.
df = pd.read_parquet(parquet_file_path)
df

	x	y	z	a	b	c
0	1	11	21	val1	2	1
1	2	12	22	val2	4	2
2	3	13	23	val3	6	0
3	4	14	24	val4	8	1
4	5	15	25	val5	10	2
5	6	16	26	val6	12	0
6	7	17	27	val7	14	1
7	8	18	28	val8	16	2
8	9	19	29	val9	18	0
9	10	20	30	val10	20	1

Rename columns

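We can rename a selection of columns. Here we assume we don’t want to rename the index columns; column c is also left unchanged so that metadata can later be attached to it under its original name. Assuming we have no knowledge of the column names, we’ll read them from the file schema.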

index_cols = ["x", "y", "z"]
# Open the file only long enough to read the column names from its schema;
# the context manager closes the handle rather than leaving it open.
with pq.ParquetFile(parquet_file_path) as source_file:
    col_names = source_file.schema.names
col_names

['x', 'y', 'z', 'a', 'b', 'c']

Create a mapping and rename the columns

# Columns that keep their original names: the index columns plus "c".
unchanged_cols = {*index_cols, "c"}
new_col_names: dict[str, str] = {
    col: f"new_{col}" for col in col_names if col not in unchanged_cols
}
output_file_path = parquet_file_path.parent / "renamed_data.parquet"
rename_and_update_metadata(
    input_path=parquet_file_path,
    output_path=output_file_path,
    rename_map=new_col_names,
    show_progress=True,
)

Processing:   0%|          | 0/10 [00:00<?, ?rows/s]
Processing: 100%|██████████| 10/10 [00:00<00:00, 9358.11rows/s]

Read the renamed file and display it

# Load the renamed file: x, y, z and c keep their names; a and b become new_a and new_b.
df_renamed = pd.read_parquet(output_file_path)
df_renamed

	x	y	z	new_a	new_b	c
0	1	11	21	val1	2	1
1	2	12	22	val2	4	2
2	3	13	23	val3	6	0
3	4	14	24	val4	8	1
4	5	15	25	val5	10	2
5	6	16	26	val6	12	0
6	7	17	27	val7	14	1
7	8	18	28	val8	16	2
8	9	19	29	val9	18	0
9	10	20	30	val10	20	1

Update metadata

We can also update the metadata in the file. In this case we add table-level metadata (a description and a version), plus descriptions for the renamed columns and for the original c column.

# Table-level metadata, stored once for the whole file.
metadata = {
    "description": "This is the description of the dataset",
    "version": "0.1.0",
}
# Per-column metadata, keyed by the column names as they appear in the renamed file.
column_descriptions = {
    "new_a": {"description": "This is the a column renamed"},
    "new_b": {"description": "This is the b column renamed"},
    "c": {
        "description": "This is the original c column",
        "unit_of_measure": "unitless",
    },
}
# The renamed file is both the input and the output here, so it is updated in place.
# NOTE(review): new_col_names still maps "a"/"b", which no longer exist in the
# already-renamed input; the output shown below suggests such keys are ignored.
# Confirm whether rename_map is needed in this call at all.
rename_and_update_metadata(
    input_path=output_file_path,
    output_path=output_file_path,
    rename_map=new_col_names,
    table_metadata=metadata,
    column_metadata=column_descriptions,
)

First the file metadata

# Parquet file-level metadata: writer, column/row counts, row groups, format version.
d_metadata = pq.read_metadata(output_file_path)
d_metadata

<pyarrow._parquet.FileMetaData object at 0x7f8089d6e6b0>
  created_by: parquet-cpp-arrow version 20.0.0
  num_columns: 6
  num_rows: 10
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 1911

Now the table metadata

pf: pq.ParquetFile = pq.ParquetFile(output_file_path)
table_metadata = pf.schema.to_arrow_schema().metadata
# Arrow stores metadata keys and values as bytes, so decode them for display.
# The schema metadata is None when the file has no table-level metadata; fall
# back to an empty mapping so this prints "{}" instead of raising.
decoded = {k.decode(): v.decode() for k, v in (table_metadata or {}).items()}
print(json.dumps(decoded, indent=2))

{
  "description": "This is the description of the dataset",
  "version": "0.1.0"
}

Now the column metadata

arrow_schema = pf.schema.to_arrow_schema()
# Collect the raw (bytes) metadata of every column alongside a decoded copy.
column_metadata = {}
column_metadata_decoded = {}
for col in pf.schema.names:
    meta = arrow_schema.field(col).metadata
    column_metadata[col] = meta
    if meta:
        column_metadata_decoded[col] = {
            key.decode(): value.decode() for key, value in meta.items()
        }
    else:
        # Columns without metadata are reported as an empty mapping.
        column_metadata_decoded[col] = {}
print(json.dumps(column_metadata_decoded, indent=2))

{
  "x": {},
  "y": {},
  "z": {},
  "new_a": {
    "description": "This is the a column renamed"
  },
  "new_b": {
    "description": "This is the b column renamed"
  },
  "c": {
    "description": "This is the original c column",
    "unit_of_measure": "unitless"
  }
}

Persisting only renamed columns

# Mapping "x" to itself keeps x under its original name, while "a" is renamed.
# With return_all_columns=False, the output shown below contains only the
# columns named in rename_map (x and new_a); y, z, b and c are not written.
new_col_names = {"x": "x", "a": "new_a"}
output_file_path_renamed_only = parquet_file_path.parent / "renamed_data_only.parquet"
rename_and_update_metadata(input_path=parquet_file_path, output_path=output_file_path_renamed_only,
                           rename_map=new_col_names, return_all_columns=False)

# Load the reduced file; the bare name on the last line displays the frame.
df_renamed_only = pd.read_parquet(output_file_path_renamed_only)
df_renamed_only

	x	new_a
0	1	val1
1	2	val2
2	3	val3
3	4	val4
4	5	val5
5	6	val6
6	7	val7
7	8	val8
8	9	val9
9	10	val10

Total running time of the script: (0 minutes 0.018 seconds)

Gallery generated by Sphinx-Gallery