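"""Extract document text from Parquet files into individual .txt files.

Each Parquet file in the input directory is expected to contain 'doc_id' and
'text' columns; one <doc_id>.txt file is written per row. Files are processed
in parallel with joblib.
"""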
import os
import pyarrow.parquet as pq
from joblib import Parallel, delayed


def extract_text_from_parquet(parquet_file, output_dir):
    # Read the Parquet file
    table = pq.read_table(parquet_file)

    # Convert the table to a Pandas DataFrame
    df = table.to_pandas()

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        doc_id = row['doc_id']
        text = row['text']

        # Create the output file path
        output_file = os.path.join(output_dir, f"{doc_id}.txt")

        # Write the text to the output file
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(text)

        print(f"Extracted text for doc_id: {doc_id}")


def process_parquet_file(parquet_file, parquet_directory, output_directory):
    parquet_file_path = os.path.join(parquet_directory, parquet_file)
    extract_text_from_parquet(parquet_file_path, output_directory)


def main():
    parquet_directory = 'hindi'
    output_directory = 'txt/'

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Get a list of all Parquet files in the input directory
    parquet_files = [file for file in os.listdir(parquet_directory) if file.endswith('.parquet')]

    # Use joblib to parallelize text extraction across the Parquet files
    # (n_jobs=-1 uses all available CPU cores)
    Parallel(n_jobs=-1)(
        delayed(process_parquet_file)(parquet_file, parquet_directory, output_directory)
        for parquet_file in parquet_files
    )


if __name__ == '__main__':
    main()