# J.A.R.V.I.S / drive_processor.py
# Uploaded via huggingface_hub by varun324242 (commit fe2a0f2, verified).
import pandas as pd
import os
from google.colab import drive
import logging
from datetime import datetime
from tqdm import tqdm
import time
import csv
def setup_logging():
    """Configure root logging to console and a timestamped file.

    Creates ``logs/process/`` if needed and returns the run timestamp
    (``YYYYMMDD_HHMMSS``) so other artifacts can share the same id.
    """
    run_id = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_dir = 'logs'
    os.makedirs(f"{log_dir}/process", exist_ok=True)
    handlers = [
        logging.StreamHandler(),
        logging.FileHandler(f'{log_dir}/process/process_{run_id}.log'),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s | %(levelname)s | %(message)s',
        handlers=handlers,
    )
    return run_id
def mount_google_drive():
    """Mount Google Drive at /content/drive and return the MyDrive root.

    Logs and re-raises any exception from ``drive.mount`` so the caller
    can abort the run.
    """
    try:
        drive.mount('/content/drive')
    except Exception as e:
        logging.error(f"Failed to mount Google Drive: {e}")
        raise
    logging.info("Google Drive mounted successfully")
    return "/content/drive/MyDrive"
def get_file_paths(base_drive):
    """Map each dataset name to its input CSV and the shared output CSV.

    Every dataset is funneled into the single merged output file
    ``<base_drive>/jj.csv``; only the input paths differ.
    """
    # Directory shorthands used by the relative input paths below.
    _OSM = "subcategory_messages/Online_and_Social_Media_Related_Crime"
    _OFF = "subcategory_messages/Online_Financial_Fraud"
    _CAD = "subcategory_messages/Cyber_Attack__Dependent_Crimes"
    _HDC = "subcategory_messages/Hacking__Damage_to_computercomputer_system_etc"

    # Dataset name -> input path relative to base_drive (insertion order matters
    # to callers iterating the result, so keep it identical to the original).
    relative_inputs = {
        "Temp22": "ss/temp22.csv",
        "Temp2": "ss/temp2.csv",
        "Temp3": "ss/temp3.csv",
        "Temp4": "ss/temp4.csv",
        "Tem3": "ss/tem3.csv",
        "Debit": "Debit.csv",
        "Fraud": "Fraud.csv",
        "Impersonating_Email": f"{_OSM}/converted_Impersonating_Email_messages_20241027_125715.csv",
        "Cyber_Bullying": f"{_OSM}/converted_Cyber_Bullying__Stalking__Sexting_messages_20241027_192454.csv",
        "Profile_Hacking": f"{_OSM}/converted_Profile_Hacking_Identity_Theft_messages_20241027_192454.csv",
        "Cheating_Impersonation": f"{_OSM}/converted_Cheating_by_Impersonation_messages_20241027_192454.csv",
        "Fake_Profile": f"{_OSM}/converted_FakeImpersonating_Profile_messages_20241028_062409.csv",
        "Provocative_Speech": f"{_OSM}/converted_Provocative_Speech_for_unlawful_acts_messages_20241027_110606.csv",
        "Matrimonial_Fraud": f"{_OSM}/converted_Online_Matrimonial_Fraud_messages_20241027_125715.csv",
        "Email_Phishing": f"{_OSM}/converted_EMail_Phishing_messages_20241027_130322.csv",
        "Intimidating_Email": "converted_iIntimidating_Email_messages_20241027_130322.csv",
        "Call_Vishing": f"{_OFF}/converted_Fraud_CallVishing_messages_20241028_105333.csv",
        "Business_Email_Compromise": f"{_OFF}/converted_Business_Email_CompromiseEmail_Takeover_messages_20241027_110606.csv",
        "Demat_Fraud": f"{_OFF}/converted_DematDepository_Fraud_messages_20241027_110606.csv",
        "Online_Gambling": "category_messages/converted_Online_Gambling__Betting_messages_20241028_070304.csv",
        "Data_Breach": f"{_CAD}/converted_Data_Breach_Theft_messages_20241027_130322.csv",
        "DDOS_Attacks": f"{_CAD}/converted_Denial_of_Service_(DoS)_Distributed_Denial_of_Service_(DDOS)_attacks_messages_20241027_130322.csv",
        "Malware_Attack": f"{_CAD}/converted_Malware_Attack_messages_20241027_130322.csv",
        "Hacking_Defacement": f"{_CAD}/converted_Hacking_Defacement_messages_20241027_130322.csv",
        "SQL_Injection": f"{_CAD}/converted_SQL_Injection_messages_20241027_130322.csv",
        "Ransomware_Attack": f"{_CAD}/converted_Ransomware_Attack_messages_20241027_130322.csv",
        "Source_Tampering": f"{_CAD}/converted_Tampering_with_computer_source_documents_messages_20241028_070304.csv",
        "Cryptocurrency_Fraud": "subcategory_messages/Cryptocurrency_Crime/converted_Cryptocurrency_Fraud_messages_20241027_130322.csv",
        "Email_Hacking": f"{_HDC}/converted_Email_Hacking_messages_20241027_130322.csv",
        "Unauthorised_Access": f"{_HDC}/converted_Unauthorised_AccessData_Breach_messages_20241028_143135.csv",
        "Website_Defacement": f"{_HDC}/converted_Website_DefacementHacking_messages_20241027_130322.csv",
        "Computer_Damage": f"{_HDC}/converted_Damage_to_computer_computer_systems_etc_messages_20241027_130322.csv",
        "Cyber_Terrorism": "subcategory_messages/Cyber_Terrorism/converted_Cyber_Terrorism_messages_20241027_130322.csv",
        "Online_Trafficking": "subcategory_messages/Online_Cyber_Trafficking/converted_Online_Trafficking_messages_20241027_130322.csv",
        "Ransomware": "subcategory_messages/Ransomware/converted_Ransomware_messages_20241027_130322.csv",
        "Against_Sovereignty": "subcategory_messages/Report_Unlawful_Content/converted_Against_Interest_of_sovereignty_or_integrity_of_India_messages_20241027_130322.csv",
    }

    merged_output = f"{base_drive}/jj.csv"
    return {
        name: {"input": f"{base_drive}/{rel}", "output": merged_output}
        for name, rel in relative_inputs.items()
    }
def process_file(input_path, output_path):
    """Append every converted message from input_path to output_path as 'ham'.

    Reads the ``converted_message`` column of the input CSV and appends one
    ``{'v1': 'ham', 'v2': <message>}`` row per input row to the output CSV
    (created with columns v1/v2 if it does not exist yet).

    Returns a stats dict (``total_processed``, ``new_entries``, ``updates``,
    ``start_time``) consumed by generate_report. Re-raises on any failure to
    read/write the files.
    """
    try:
        logging.info(f"\nProcessing file: {input_path}")

        input_df = pd.read_csv(input_path)
        logging.info(f"Input file loaded: {len(input_df)} rows")

        # Load the existing merged output, or start a fresh frame.
        if os.path.exists(output_path):
            output_df = pd.read_csv(output_path)
            logging.info(f"Existing output file loaded: {len(output_df)} rows")
        else:
            output_df = pd.DataFrame(columns=['v1', 'v2'])
            logging.info("Created new output file")

        stats = {
            'total_processed': 0,
            'new_entries': 0,
            'updates': 0,  # always 0 here; kept for generate_report's schema
            'start_time': time.time()
        }

        # Accumulate new rows in a plain list and concat once: calling
        # pd.concat per row (as before) copies the whole frame each time,
        # making the loop O(n^2) in the number of messages.
        new_rows = []
        with tqdm(total=len(input_df), desc="Processing messages") as pbar:
            for _, row in input_df.iterrows():
                try:
                    new_rows.append({
                        'v1': 'ham',
                        'v2': row['converted_message']
                    })
                    stats['new_entries'] += 1
                    stats['total_processed'] += 1
                except Exception as e:
                    logging.error(f"Error processing row: {e}")
                    continue
                pbar.update(1)
                # Periodic checkpoint so a crash loses at most ~100 rows.
                if stats['total_processed'] % 100 == 0:
                    pd.concat([output_df, pd.DataFrame(new_rows)],
                              ignore_index=True).to_csv(output_path, index=False)
                    logging.info(f"Periodic save: {stats['total_processed']} messages processed")

        # Final save: merge all accumulated rows in a single concat.
        if new_rows:
            output_df = pd.concat([output_df, pd.DataFrame(new_rows)],
                                  ignore_index=True)
        output_df.to_csv(output_path, index=False)

        processing_time = time.time() - stats['start_time']
        avg_time = processing_time / max(stats['total_processed'], 1)  # avoid /0
        logging.info(f"""
Processing Complete:
- Total Processed: {stats['total_processed']}
- New Entries: {stats['new_entries']}
- Updates: {stats['updates']}
- Processing Time: {processing_time:.2f} seconds
- Average Time per Message: {avg_time:.2f} seconds
Output File: {output_path}
""")
        return stats
    except Exception as e:
        logging.error(f"Error processing file {input_path}: {e}")
        raise
def generate_report(file_stats, timestamp):
    """Write a plain-text summary of all per-file processing stats.

    Args:
        file_stats: mapping of file name -> stats dict as returned by
            process_file (keys: total_processed, new_entries, updates,
            start_time).
        timestamp: run id used in the report filename.

    The report goes to ``reports/processing_report_<timestamp>.txt``.
    NOTE(review): per-file elapsed time is computed here as
    ``time.time() - start_time``, so it includes time spent on files
    processed afterwards — it overstates per-file duration.
    """
    report_dir = 'reports'
    os.makedirs(report_dir, exist_ok=True)
    report_path = f"{report_dir}/processing_report_{timestamp}.txt"
    with open(report_path, 'w') as f:
        f.write(f"""
Processing Report - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
================================================================
Files Processed:
""")
        total_messages = 0
        total_new = 0
        total_updates = 0
        total_time = 0
        for file_name, stats in file_stats.items():
            f.write(f"""
{file_name}:
- Total Processed: {stats['total_processed']}
- New Entries: {stats['new_entries']}
- Updates: {stats['updates']}
- Processing Time: {time.time() - stats['start_time']:.2f} seconds
""")
            total_messages += stats['total_processed']
            total_new += stats['new_entries']
            total_updates += stats['updates']
            total_time += time.time() - stats['start_time']
        # Guard against ZeroDivisionError when nothing was processed
        # (e.g. every file failed and main() skipped it).
        avg_time = total_time / max(total_messages, 1)
        f.write(f"""
================================================================
Summary:
- Total Messages Processed: {total_messages}
- Total New Entries: {total_new}
- Total Updates: {total_updates}
- Total Processing Time: {total_time:.2f} seconds
- Average Time per Message: {avg_time:.2f} seconds
================================================================
""")
    logging.info(f"Report generated: {report_path}")
def main():
    """Mount Drive, merge every configured CSV into jj.csv, write a report.

    Per-file failures are logged and skipped; failures outside the per-file
    loop (mount, report) are logged and re-raised.
    """
    run_id = setup_logging()
    logging.info("Starting processing...")
    try:
        drive_root = mount_google_drive()
        targets = get_file_paths(drive_root)

        results = {}
        for name, io_paths in targets.items():
            logging.info(f"\nProcessing {name}...")
            try:
                results[name] = process_file(io_paths['input'], io_paths['output'])
            except Exception as e:
                # Skip this file but keep going with the rest.
                logging.error(f"Error processing {name}: {e}")

        generate_report(results, run_id)
        logging.info("Processing complete!")
    except Exception as e:
        logging.error(f"Critical error: {e}")
        raise
if __name__ == "__main__":
    main()