Phoneme / csv_1.py
Alyosha11's picture
Upload csv_1.py with huggingface_hub
9c642b1 verified
raw
history blame
1.55 kB
import os
import pyarrow.parquet as pq
from joblib import Parallel, delayed
def extract_text_from_parquet(parquet_file, output_dir):
    """Write the ``text`` of every row in a Parquet file to its own ``.txt`` file.

    Each row produces ``<output_dir>/<doc_id>.txt`` encoded as UTF-8; an
    existing file with the same name is overwritten. Rows whose ``text`` is
    not a string (for example a null value) are skipped and reported.

    Args:
        parquet_file: Path of the Parquet file to read. It must provide the
            ``doc_id`` and ``text`` columns.
        output_dir: Directory the text files are written into. It is not
            created here, so it must already exist.
    """
    # Load only the two columns that are used, so unrelated columns are not
    # read or held in memory.
    table = pq.read_table(parquet_file, columns=['doc_id', 'text'])
    df = table.to_pandas()

    # Walk the two columns in lockstep; this avoids building a Series object
    # for every row the way DataFrame.iterrows() does.
    for doc_id, text in zip(df['doc_id'], df['text']):
        if not isinstance(text, str):
            # file.write() only accepts str; skip the row instead of letting
            # a single null value abort the rest of this Parquet file.
            print(f"Skipped doc_id: {doc_id} (no text)")
            continue

        # NOTE(review): doc_id is used verbatim as the file name; presumably
        # it never contains path separators -- confirm against the dataset.
        output_file = os.path.join(output_dir, f"{doc_id}.txt")
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(text)
        print(f"Extracted text for doc_id: {doc_id}")
def process_parquet_file(parquet_file, parquet_directory, output_directory):
    """Extract the text of one Parquet file located inside a directory.

    Args:
        parquet_file: File name of the Parquet file within ``parquet_directory``.
        parquet_directory: Directory that contains ``parquet_file``.
        output_directory: Destination directory handed on to
            ``extract_text_from_parquet``.
    """
    extract_text_from_parquet(
        os.path.join(parquet_directory, parquet_file),
        output_directory,
    )
def main():
    """Extract text from every ``.parquet`` file in ``hindi`` into ``txt/``."""
    parquet_directory = 'hindi'
    output_directory = 'txt/'

    # The destination must exist before any worker starts writing into it.
    os.makedirs(output_directory, exist_ok=True)

    # Only entries directly inside the input directory whose name ends in
    # ".parquet" are processed.
    parquet_files = [
        name
        for name in os.listdir(parquet_directory)
        if name.endswith('.parquet')
    ]

    # Build one joblib task per Parquet file and hand them to the worker
    # pool; n_jobs=-1 asks joblib to use all available CPUs.
    run_one = delayed(process_parquet_file)
    tasks = (
        run_one(name, parquet_directory, output_directory)
        for name in parquet_files
    )
    Parallel(n_jobs=-1)(tasks)
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()