# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import shutil import os from datetime import timedelta import pyarrow as pa import pyarrow.dataset as ds import pyarrow.parquet.encryption as pe from pyarrow.tests.parquet.encryption import InMemoryKmsClient """ A sample to demonstrate parquet dataset encryption and decryption""" # create a list of dictionaries that will represent our dataset table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], 'n_legs': [2, 2, 4, 4, 5, 100], 'animal': ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]}) # create a PyArrow dataset from the table dataset = ds.dataset(table) FOOTER_KEY = b"0123456789112345" FOOTER_KEY_NAME = "footer_key" COL_KEY = b"1234567890123450" COL_KEY_NAME = "col_key" encryption_config = pe.EncryptionConfiguration( footer_key=FOOTER_KEY_NAME, plaintext_footer=False, # Use COL_KEY_NAME to encrypt `n_legs` and `animal` columns. column_keys={ COL_KEY_NAME: ["n_legs", "animal"], }, encryption_algorithm="AES_GCM_V1", # requires timedelta or an assertion is raised cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) kms_connection_config = pe.KmsConnectionConfig( custom_kms_conf={ FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), COL_KEY_NAME: COL_KEY.decode("UTF-8"), } ) decryption_config = pe.DecryptionConfiguration(cache_lifetime=300) def kms_factory(kms_connection_configuration): return InMemoryKmsClient(kms_connection_configuration) crypto_factory = pe.CryptoFactory(kms_factory) parquet_encryption_cfg = ds.ParquetEncryptionConfig( crypto_factory, kms_connection_config, encryption_config) parquet_decryption_cfg = ds.ParquetDecryptionConfig(crypto_factory, kms_connection_config, decryption_config) # set encryption config for parquet fragment scan options pq_scan_opts = ds.ParquetFragmentScanOptions() pq_scan_opts.parquet_decryption_config = parquet_decryption_cfg pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) if os.path.exists('sample_dataset'): shutil.rmtree('sample_dataset') write_options = pformat.make_write_options( encryption_config=parquet_encryption_cfg) ds.write_dataset(data=dataset, base_dir="sample_dataset", partitioning=['year'], format=pformat, file_options=write_options) # read the dataset back dataset = ds.dataset('sample_dataset', format=pformat) # print the dataset print(dataset.to_table())