Renaming and Metadata

A simple example to demonstrate how to rename columns in a parquet file. Additionally, we can update the metadata in the file - in this case we add column descriptions.

import json
import tempfile

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path

from parq_tools import rename_and_update_metadata

Create a Parquet file

Create a temporary parquet file for demonstration

def create_parquet_file(file_path: Path):
    """Write a small demonstration dataset to ``file_path`` as Parquet.

    Columns ``x``, ``y`` and ``z`` serve as index-like columns; ``a``,
    ``b`` and ``c`` are supplementary columns.
    """
    rows = range(1, 11)
    columns = {
        "x": list(rows),                 # index column
        "y": list(range(11, 21)),        # index column
        "z": list(range(21, 31)),        # index column
        "a": [f"val{i}" for i in rows],  # supplementary column
        "b": [i * 2 for i in rows],      # supplementary column
        "c": [i % 3 for i in rows],      # supplementary column
    }

    # Build an Arrow table (not a pandas DataFrame) and persist it.
    table = pa.Table.from_pydict(columns)
    pq.write_table(table, file_path)


# Write the example dataset to a file in the system temp directory.
parquet_file_path = Path(tempfile.gettempdir()) / "example_data.parquet"
create_parquet_file(parquet_file_path)

View the file as a DataFrame

# Load the Parquet file into pandas for display.
df = pd.read_parquet(parquet_file_path)
df
x y z a b c
0 1 11 21 val1 2 1
1 2 12 22 val2 4 2
2 3 13 23 val3 6 0
3 4 14 24 val4 8 1
4 5 15 25 val5 10 2
5 6 16 26 val6 12 0
6 7 17 27 val7 14 1
7 8 18 28 val8 16 2
8 9 19 29 val9 18 0
9 10 20 30 val10 20 1


Rename columns

We can rename a selection of columns. Here we assume we don’t want to rename the index columns. Assuming we have no knowledge of the column names, we’ll read them from the file schema.

# Columns we will NOT rename (treated as the index).
index_cols = ["x", "y", "z"]
# Read all column names from the Parquet file's schema (no prior knowledge needed).
col_names = pq.ParquetFile(parquet_file_path).schema.names
col_names
['x', 'y', 'z', 'a', 'b', 'c']

Create a mapping and rename the columns

# Map every non-index column (except 'c') to a "new_"-prefixed name.
new_col_names: dict[str, str] = {col: f"new_{col}" for col in col_names if col not in index_cols + ['c']}
output_file_path = parquet_file_path.parent / "renamed_data.parquet"
# Rewrite the file with the renamed columns, showing a progress bar.
rename_and_update_metadata(input_path=parquet_file_path, output_path=output_file_path,
                           rename_map=new_col_names, show_progress=True)
Processing:   0%|          | 0/10 [00:00<?, ?rows/s]
Processing: 100%|██████████| 10/10 [00:00<00:00, 9358.11rows/s]

Read the renamed file and display it

# Confirm the rename by reading the new file back.
df_renamed = pd.read_parquet(output_file_path)
df_renamed
x y z new_a new_b c
0 1 11 21 val1 2 1
1 2 12 22 val2 4 2
2 3 13 23 val3 6 0
3 4 14 24 val4 8 1
4 5 15 25 val5 10 2
5 6 16 26 val6 12 0
6 7 17 27 val7 14 1
7 8 18 28 val8 16 2
8 9 19 29 val9 18 0
9 10 20 30 val10 20 1


Update metadata

We can also update the metadata in the file. In this case we add table-level metadata (a description and a version) as well as descriptions for individual columns.

# Table-level (file-wide) metadata.
metadata = {
    "description": "This is the description of the dataset",
    "version": "0.1.0",
}
# Per-column metadata; keys must match the CURRENT column names in the file.
column_descriptions = {'new_a': {'description': "This is the a column renamed"},
                       'new_b': {'description': "This is the b column renamed"},
                       'c': {'description': "This is the original c column",
                             'unit_of_measure': "unitless"}}
# NOTE(review): input_path == output_path — assumes rename_and_update_metadata
# supports in-place updates; confirm. Also, the rename_map keys ('a', 'b') no
# longer exist in this file (they were renamed above), so the map presumably
# has no effect here — verify, or pass an empty map instead.
rename_and_update_metadata(input_path=output_file_path, output_path=output_file_path,
                           rename_map=new_col_names, table_metadata=metadata, column_metadata=column_descriptions)

First the file metadata

# File-level Parquet metadata (row/column counts, format version, etc.).
d_metadata = pq.read_metadata(output_file_path)
d_metadata
<pyarrow._parquet.FileMetaData object at 0x7f8089d6e6b0>
  created_by: parquet-cpp-arrow version 20.0.0
  num_columns: 6
  num_rows: 10
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 1911

Now the table metadata

# Table metadata is stored on the Arrow schema as bytes; decode it for display.
pf: pq.ParquetFile = pq.ParquetFile(output_file_path)
table_metadata = pf.schema.to_arrow_schema().metadata
decoded = {k.decode(): v.decode() for k, v in table_metadata.items()}
print(json.dumps(decoded, indent=2))
{
  "description": "This is the description of the dataset",
  "version": "0.1.0"
}

Now the column metadata

arrow_schema = pf.schema.to_arrow_schema()
# Extract column metadata
column_metadata = {col: arrow_schema.field(col).metadata for col in pf.schema.names}
# Decode metadata
# Field metadata is stored as bytes; columns without metadata yield None,
# which we render as an empty dict.
column_metadata_decoded = {
    col: {k.decode(): v.decode() for k, v in meta.items()} if meta else {}
    for col, meta in column_metadata.items()
}
print(json.dumps(column_metadata_decoded, indent=2))
{
  "x": {},
  "y": {},
  "z": {},
  "new_a": {
    "description": "This is the a column renamed"
  },
  "new_b": {
    "description": "This is the b column renamed"
  },
  "c": {
    "description": "This is the original c column",
    "unit_of_measure": "unitless"
  }
}

Persisting only renamed columns

With ``return_all_columns=False``, only the columns named in the rename map are written to the output file; an identity mapping such as ``"x": "x"`` keeps a column under its original name.

# With return_all_columns=False, only the columns in the rename map are kept.
# The identity mapping "x": "x" keeps column x under its original name.
new_col_names = {"x": "x", "a": "new_a"}
output_file_path_renamed_only = parquet_file_path.parent / "renamed_data_only.parquet"
rename_and_update_metadata(input_path=parquet_file_path, output_path=output_file_path_renamed_only,
                           rename_map=new_col_names, return_all_columns=False)

# Read back to confirm only the two mapped columns were written.
df_renamed_only = pd.read_parquet(output_file_path_renamed_only)
df_renamed_only
x new_a
0 1 val1
1 2 val2
2 3 val3
3 4 val4
4 5 val5
5 6 val6
6 7 val7
7 8 val8
8 9 val9
9 10 val10


Total running time of the script: (0 minutes 0.018 seconds)

Gallery generated by Sphinx-Gallery