# NOTE(review): the original file began with "Spaces:" / "Sleeping" lines —
# scrape artifacts from a Hugging Face Space status page, not Python code.
# Preserved here as a comment so the module remains syntactically valid.
import pandas as pd | |
import os | |
from google.colab import drive | |
import logging | |
from datetime import datetime | |
from tqdm import tqdm | |
import time | |
import csv | |
def setup_logging():
    """Configure root logging to stream to the console and a timestamped file.

    Creates the ``logs/process`` directory if needed.

    Returns:
        str: the run timestamp (``YYYYMMDD_HHMMSS``) embedded in the log
        file name, for reuse in other per-run artifacts.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_dir = 'logs'
    os.makedirs(f"{log_dir}/process", exist_ok=True)
    # One handler per destination: console for interactive runs, file for audit.
    log_file = logging.FileHandler(f'{log_dir}/process/process_{timestamp}.log')
    console = logging.StreamHandler()
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s | %(levelname)s | %(message)s',
        handlers=[console, log_file],
    )
    return timestamp
def mount_google_drive():
    """Mount the user's Google Drive and return its base directory path.

    Returns:
        str: ``/content/drive/MyDrive``, the root under which all input and
        output CSVs live.

    Raises:
        Exception: re-raised (after logging) if mounting fails.
    """
    try:
        drive.mount('/content/drive')
        logging.info("Google Drive mounted successfully")
        return "/content/drive/MyDrive"
    except Exception as e:
        logging.error(f"Failed to mount Google Drive: {e}")
        raise
def get_file_paths(base_drive):
    """Map each dataset name to its input CSV and the shared output CSV.

    Args:
        base_drive: root of the mounted Google Drive
            (e.g. ``/content/drive/MyDrive``).

    Returns:
        dict: ``name -> {"input": <source csv path>, "output": <jj.csv path>}``.
        Every dataset funnels into the same output file.
    """
    # Input CSV locations relative to base_drive.  Only the inputs vary;
    # the output is one shared accumulator file.
    relative_inputs = {
        "Temp22": "ss/temp22.csv",
        "Temp2": "ss/temp2.csv",
        "Temp3": "ss/temp3.csv",
        "Temp4": "ss/temp4.csv",
        "Tem3": "ss/tem3.csv",
        "Debit": "Debit.csv",
        "Fraud": "Fraud.csv",
        "Impersonating_Email": "subcategory_messages/Online_and_Social_Media_Related_Crime/converted_Impersonating_Email_messages_20241027_125715.csv",
        "Cyber_Bullying": "subcategory_messages/Online_and_Social_Media_Related_Crime/converted_Cyber_Bullying__Stalking__Sexting_messages_20241027_192454.csv",
        "Profile_Hacking": "subcategory_messages/Online_and_Social_Media_Related_Crime/converted_Profile_Hacking_Identity_Theft_messages_20241027_192454.csv",
        "Cheating_Impersonation": "subcategory_messages/Online_and_Social_Media_Related_Crime/converted_Cheating_by_Impersonation_messages_20241027_192454.csv",
        "Fake_Profile": "subcategory_messages/Online_and_Social_Media_Related_Crime/converted_FakeImpersonating_Profile_messages_20241028_062409.csv",
        "Provocative_Speech": "subcategory_messages/Online_and_Social_Media_Related_Crime/converted_Provocative_Speech_for_unlawful_acts_messages_20241027_110606.csv",
        "Matrimonial_Fraud": "subcategory_messages/Online_and_Social_Media_Related_Crime/converted_Online_Matrimonial_Fraud_messages_20241027_125715.csv",
        "Email_Phishing": "subcategory_messages/Online_and_Social_Media_Related_Crime/converted_EMail_Phishing_messages_20241027_130322.csv",
        # NOTE(review): "iIntimidating" looks like a typo but presumably matches
        # the actual uploaded file name — verify before renaming.
        "Intimidating_Email": "converted_iIntimidating_Email_messages_20241027_130322.csv",
        "Call_Vishing": "subcategory_messages/Online_Financial_Fraud/converted_Fraud_CallVishing_messages_20241028_105333.csv",
        "Business_Email_Compromise": "subcategory_messages/Online_Financial_Fraud/converted_Business_Email_CompromiseEmail_Takeover_messages_20241027_110606.csv",
        "Demat_Fraud": "subcategory_messages/Online_Financial_Fraud/converted_DematDepository_Fraud_messages_20241027_110606.csv",
        "Online_Gambling": "category_messages/converted_Online_Gambling__Betting_messages_20241028_070304.csv",
        "Data_Breach": "subcategory_messages/Cyber_Attack__Dependent_Crimes/converted_Data_Breach_Theft_messages_20241027_130322.csv",
        "DDOS_Attacks": "subcategory_messages/Cyber_Attack__Dependent_Crimes/converted_Denial_of_Service_(DoS)_Distributed_Denial_of_Service_(DDOS)_attacks_messages_20241027_130322.csv",
        "Malware_Attack": "subcategory_messages/Cyber_Attack__Dependent_Crimes/converted_Malware_Attack_messages_20241027_130322.csv",
        "Hacking_Defacement": "subcategory_messages/Cyber_Attack__Dependent_Crimes/converted_Hacking_Defacement_messages_20241027_130322.csv",
        "SQL_Injection": "subcategory_messages/Cyber_Attack__Dependent_Crimes/converted_SQL_Injection_messages_20241027_130322.csv",
        "Ransomware_Attack": "subcategory_messages/Cyber_Attack__Dependent_Crimes/converted_Ransomware_Attack_messages_20241027_130322.csv",
        "Source_Tampering": "subcategory_messages/Cyber_Attack__Dependent_Crimes/converted_Tampering_with_computer_source_documents_messages_20241028_070304.csv",
        "Cryptocurrency_Fraud": "subcategory_messages/Cryptocurrency_Crime/converted_Cryptocurrency_Fraud_messages_20241027_130322.csv",
        "Email_Hacking": "subcategory_messages/Hacking__Damage_to_computercomputer_system_etc/converted_Email_Hacking_messages_20241027_130322.csv",
        "Unauthorised_Access": "subcategory_messages/Hacking__Damage_to_computercomputer_system_etc/converted_Unauthorised_AccessData_Breach_messages_20241028_143135.csv",
        "Website_Defacement": "subcategory_messages/Hacking__Damage_to_computercomputer_system_etc/converted_Website_DefacementHacking_messages_20241027_130322.csv",
        "Computer_Damage": "subcategory_messages/Hacking__Damage_to_computercomputer_system_etc/converted_Damage_to_computer_computer_systems_etc_messages_20241027_130322.csv",
        "Cyber_Terrorism": "subcategory_messages/Cyber_Terrorism/converted_Cyber_Terrorism_messages_20241027_130322.csv",
        "Online_Trafficking": "subcategory_messages/Online_Cyber_Trafficking/converted_Online_Trafficking_messages_20241027_130322.csv",
        "Ransomware": "subcategory_messages/Ransomware/converted_Ransomware_messages_20241027_130322.csv",
        "Against_Sovereignty": "subcategory_messages/Report_Unlawful_Content/converted_Against_Interest_of_sovereignty_or_integrity_of_India_messages_20241027_130322.csv",
    }
    shared_output = f"{base_drive}/jj.csv"
    return {
        name: {"input": f"{base_drive}/{rel}", "output": shared_output}
        for name, rel in relative_inputs.items()
    }
def process_file(input_path, output_path):
    """Append every converted message from input_path to output_path as 'ham'.

    Reads the input CSV (expects a ``converted_message`` column) and appends
    one ``{'v1': 'ham', 'v2': <message>}`` row per input row to the output
    CSV, creating it with columns ``v1``/``v2`` if it does not exist.

    Args:
        input_path: CSV file containing a ``converted_message`` column.
        output_path: accumulator CSV with columns ``v1`` (label) and
            ``v2`` (message text).

    Returns:
        dict: statistics — ``total_processed``, ``new_entries``, ``updates``
        (always 0; kept for report compatibility) and ``start_time``.

    Raises:
        Exception: re-raised (after logging) if reading or writing fails.
    """
    # tqdm is a progress-bar nicety only; fall back to a no-op stand-in so
    # the function also works in environments where tqdm is not installed.
    try:
        from tqdm import tqdm
    except ImportError:
        class tqdm:  # minimal no-op stand-in with the same context API
            def __init__(self, *args, **kwargs):
                pass
            def __enter__(self):
                return self
            def __exit__(self, *exc):
                return False
            def update(self, n=1):
                pass
    try:
        logging.info(f"\nProcessing file: {input_path}")
        input_df = pd.read_csv(input_path)
        logging.info(f"Input file loaded: {len(input_df)} rows")
        if os.path.exists(output_path):
            output_df = pd.read_csv(output_path)
            logging.info(f"Existing output file loaded: {len(output_df)} rows")
        else:
            output_df = pd.DataFrame(columns=['v1', 'v2'])
            logging.info("Created new output file")
        stats = {
            'total_processed': 0,
            'new_entries': 0,
            'updates': 0,  # nothing is updated in place; kept for reporting
            'start_time': time.time()
        }
        # Accumulate rows in a plain list and concatenate once: the previous
        # per-row pd.concat rebuilt the whole DataFrame each iteration (O(n^2)).
        new_rows = []
        with tqdm(total=len(input_df), desc="Processing messages") as pbar:
            for _, row in input_df.iterrows():
                try:
                    new_rows.append({'v1': 'ham', 'v2': row['converted_message']})
                    stats['new_entries'] += 1
                    stats['total_processed'] += 1
                except Exception as e:
                    logging.error(f"Error processing row: {e}")
                    continue
                pbar.update(1)
                # Periodic checkpoint so a crash loses at most 100 rows.
                if stats['total_processed'] % 100 == 0:
                    pd.concat([output_df, pd.DataFrame(new_rows)],
                              ignore_index=True).to_csv(output_path, index=False)
                    logging.info(f"Periodic save: {stats['total_processed']} messages processed")
        if new_rows:
            output_df = pd.concat([output_df, pd.DataFrame(new_rows)], ignore_index=True)
        # Final save
        output_df.to_csv(output_path, index=False)
        processing_time = time.time() - stats['start_time']
        avg_time = processing_time / max(stats['total_processed'], 1)  # avoid div-by-zero
        logging.info(f"""
Processing Complete:
- Total Processed: {stats['total_processed']}
- New Entries: {stats['new_entries']}
- Updates: {stats['updates']}
- Processing Time: {processing_time:.2f} seconds
- Average Time per Message: {avg_time:.2f} seconds
Output File: {output_path}
""")
        return stats
    except Exception as e:
        logging.error(f"Error processing file {input_path}: {e}")
        raise
def generate_report(file_stats, timestamp):
    """Write a plain-text summary report of all processed files.

    Args:
        file_stats: mapping of file name -> stats dict as returned by
            ``process_file`` (keys: ``total_processed``, ``new_entries``,
            ``updates``, ``start_time``).
        timestamp: run timestamp embedded in the report file name.
    """
    report_dir = 'reports'
    os.makedirs(report_dir, exist_ok=True)
    report_path = f"{report_dir}/processing_report_{timestamp}.txt"
    with open(report_path, 'w') as f:
        f.write(f"""
Processing Report - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
================================================================
Files Processed:
""")
        total_messages = 0
        total_new = 0
        total_updates = 0
        total_time = 0
        for file_name, stats in file_stats.items():
            # Compute elapsed once so the per-file line and the summed total
            # agree (previously time.time() was sampled twice per file).
            elapsed = time.time() - stats['start_time']
            f.write(f"""
{file_name}:
- Total Processed: {stats['total_processed']}
- New Entries: {stats['new_entries']}
- Updates: {stats['updates']}
- Processing Time: {elapsed:.2f} seconds
""")
            total_messages += stats['total_processed']
            total_new += stats['new_entries']
            total_updates += stats['updates']
            total_time += elapsed
        # Guard against division by zero when nothing was processed
        # (e.g. every input file failed and file_stats is empty).
        avg_time = total_time / max(total_messages, 1)
        f.write(f"""
================================================================
Summary:
- Total Messages Processed: {total_messages}
- Total New Entries: {total_new}
- Total Updates: {total_updates}
- Total Processing Time: {total_time:.2f} seconds
- Average Time per Message: {avg_time:.2f} seconds
================================================================
""")
    logging.info(f"Report generated: {report_path}")
def main():
    """Run the full pipeline: set up logging, mount Drive, process every
    configured file, then write the summary report."""
    timestamp = setup_logging()
    logging.info("Starting processing...")
    try:
        base_drive = mount_google_drive()
        file_paths = get_file_paths(base_drive)
        file_stats = {}
        # Each dataset is processed independently: one failure is logged
        # and skipped rather than aborting the whole run.
        for file_name, paths in file_paths.items():
            logging.info(f"\nProcessing {file_name}...")
            try:
                file_stats[file_name] = process_file(paths['input'], paths['output'])
            except Exception as e:
                logging.error(f"Error processing {file_name}: {e}")
                continue
        generate_report(file_stats, timestamp)
        logging.info("Processing complete!")
    except Exception as e:
        logging.error(f"Critical error: {e}")
        raise
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()