Alyosha11 committed on
Commit
9c642b1
·
verified ·
1 Parent(s): 2b62dc4

Upload csv_1.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. csv_1.py +43 -0
csv_1.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pyarrow.parquet as pq
3
+ from joblib import Parallel, delayed
4
+
5
def extract_text_from_parquet(parquet_file, output_dir):
    """Extract per-document text from a Parquet file into individual .txt files.

    Reads *parquet_file*, which is expected to contain 'doc_id' and 'text'
    columns, and writes each row's text to ``<output_dir>/<doc_id>.txt``
    encoded as UTF-8.

    Args:
        parquet_file: Path to the input Parquet file.
        output_dir: Directory the .txt files are written into (must exist).
    """
    # Read the Parquet file and convert to a pandas DataFrame for row access.
    df = pq.read_table(parquet_file).to_pandas()

    # itertuples() is substantially faster than iterrows(): it avoids
    # constructing a pandas Series (with dtype coercion) for every row.
    for row in df.itertuples(index=False):
        doc_id = row.doc_id
        text = row.text

        # One output file per document, named after its doc_id.
        output_file = os.path.join(output_dir, f"{doc_id}.txt")

        # Write the text to the output file
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(text)

        print(f"Extracted text for doc_id: {doc_id}")
25
+
26
def process_parquet_file(parquet_file, parquet_directory, output_directory):
    """Resolve *parquet_file* inside *parquet_directory* and extract its text."""
    extract_text_from_parquet(
        os.path.join(parquet_directory, parquet_file), output_directory
    )
29
+
30
def main():
    """Extract text from every Parquet file in ``hindi/`` into ``txt/``.

    Discovers all ``.parquet`` files in the input directory and fans the
    per-file extraction out across all CPU cores with joblib.
    """
    parquet_directory = 'hindi'
    output_directory = 'txt/'

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # Collect every Parquet file in the directory; sort so processing
    # order is deterministic (os.listdir order is arbitrary).
    parquet_files = sorted(
        file for file in os.listdir(parquet_directory)
        if file.endswith('.parquet')
    )

    # Use joblib to parallelize the extraction across all cores (n_jobs=-1).
    Parallel(n_jobs=-1)(
        delayed(process_parquet_file)(parquet_file, parquet_directory, output_directory)
        for parquet_file in parquet_files
    )


if __name__ == '__main__':
    main()