# J.A.R.V.I.S / drive_processor.py
# Uploaded via huggingface_hub by varun324242 (commit fe2a0f2, verified).
import pandas as pd
import os
from google.colab import drive
import logging
from datetime import datetime
from tqdm import tqdm
import time
import csv
def setup_logging():
    """Configure root logging to console and a timestamped file.

    Creates ``logs/process/`` if needed and returns the run timestamp
    (``YYYYMMDD_HHMMSS``) so other artifacts can share the same id.
    """
    run_id = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_dir = 'logs'
    os.makedirs(f"{log_dir}/process", exist_ok=True)
    handlers = [
        logging.StreamHandler(),
        logging.FileHandler(f'{log_dir}/process/process_{run_id}.log'),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s | %(levelname)s | %(message)s',
        handlers=handlers,
    )
    return run_id
def mount_google_drive():
    """Mount Google Drive at /content/drive and return the MyDrive root.

    Logs and re-raises any exception from ``drive.mount`` so the caller
    can abort the run.
    """
    try:
        drive.mount('/content/drive')
    except Exception as e:
        logging.error(f"Failed to mount Google Drive: {e}")
        raise
    logging.info("Google Drive mounted successfully")
    return "/content/drive/MyDrive"
def get_file_paths(base_drive):
    """Map each dataset name to its input CSV and the shared output CSV.

    Every dataset is funneled into the single merged output file
    ``<base_drive>/jj.csv``; only the input paths differ.
    """
    # Directory shorthands used by the relative input paths below.
    _OSM = "subcategory_messages/Online_and_Social_Media_Related_Crime"
    _OFF = "subcategory_messages/Online_Financial_Fraud"
    _CAD = "subcategory_messages/Cyber_Attack__Dependent_Crimes"
    _HDC = "subcategory_messages/Hacking__Damage_to_computercomputer_system_etc"

    # Dataset name -> input path relative to base_drive (insertion order matters
    # to callers iterating the result, so keep it identical to the original).
    relative_inputs = {
        "Temp22": "ss/temp22.csv",
        "Temp2": "ss/temp2.csv",
        "Temp3": "ss/temp3.csv",
        "Temp4": "ss/temp4.csv",
        "Tem3": "ss/tem3.csv",
        "Debit": "Debit.csv",
        "Fraud": "Fraud.csv",
        "Impersonating_Email": f"{_OSM}/converted_Impersonating_Email_messages_20241027_125715.csv",
        "Cyber_Bullying": f"{_OSM}/converted_Cyber_Bullying__Stalking__Sexting_messages_20241027_192454.csv",
        "Profile_Hacking": f"{_OSM}/converted_Profile_Hacking_Identity_Theft_messages_20241027_192454.csv",
        "Cheating_Impersonation": f"{_OSM}/converted_Cheating_by_Impersonation_messages_20241027_192454.csv",
        "Fake_Profile": f"{_OSM}/converted_FakeImpersonating_Profile_messages_20241028_062409.csv",
        "Provocative_Speech": f"{_OSM}/converted_Provocative_Speech_for_unlawful_acts_messages_20241027_110606.csv",
        "Matrimonial_Fraud": f"{_OSM}/converted_Online_Matrimonial_Fraud_messages_20241027_125715.csv",
        "Email_Phishing": f"{_OSM}/converted_EMail_Phishing_messages_20241027_130322.csv",
        "Intimidating_Email": "converted_iIntimidating_Email_messages_20241027_130322.csv",
        "Call_Vishing": f"{_OFF}/converted_Fraud_CallVishing_messages_20241028_105333.csv",
        "Business_Email_Compromise": f"{_OFF}/converted_Business_Email_CompromiseEmail_Takeover_messages_20241027_110606.csv",
        "Demat_Fraud": f"{_OFF}/converted_DematDepository_Fraud_messages_20241027_110606.csv",
        "Online_Gambling": "category_messages/converted_Online_Gambling__Betting_messages_20241028_070304.csv",
        "Data_Breach": f"{_CAD}/converted_Data_Breach_Theft_messages_20241027_130322.csv",
        "DDOS_Attacks": f"{_CAD}/converted_Denial_of_Service_(DoS)_Distributed_Denial_of_Service_(DDOS)_attacks_messages_20241027_130322.csv",
        "Malware_Attack": f"{_CAD}/converted_Malware_Attack_messages_20241027_130322.csv",
        "Hacking_Defacement": f"{_CAD}/converted_Hacking_Defacement_messages_20241027_130322.csv",
        "SQL_Injection": f"{_CAD}/converted_SQL_Injection_messages_20241027_130322.csv",
        "Ransomware_Attack": f"{_CAD}/converted_Ransomware_Attack_messages_20241027_130322.csv",
        "Source_Tampering": f"{_CAD}/converted_Tampering_with_computer_source_documents_messages_20241028_070304.csv",
        "Cryptocurrency_Fraud": "subcategory_messages/Cryptocurrency_Crime/converted_Cryptocurrency_Fraud_messages_20241027_130322.csv",
        "Email_Hacking": f"{_HDC}/converted_Email_Hacking_messages_20241027_130322.csv",
        "Unauthorised_Access": f"{_HDC}/converted_Unauthorised_AccessData_Breach_messages_20241028_143135.csv",
        "Website_Defacement": f"{_HDC}/converted_Website_DefacementHacking_messages_20241027_130322.csv",
        "Computer_Damage": f"{_HDC}/converted_Damage_to_computer_computer_systems_etc_messages_20241027_130322.csv",
        "Cyber_Terrorism": "subcategory_messages/Cyber_Terrorism/converted_Cyber_Terrorism_messages_20241027_130322.csv",
        "Online_Trafficking": "subcategory_messages/Online_Cyber_Trafficking/converted_Online_Trafficking_messages_20241027_130322.csv",
        "Ransomware": "subcategory_messages/Ransomware/converted_Ransomware_messages_20241027_130322.csv",
        "Against_Sovereignty": "subcategory_messages/Report_Unlawful_Content/converted_Against_Interest_of_sovereignty_or_integrity_of_India_messages_20241027_130322.csv",
    }

    merged_output = f"{base_drive}/jj.csv"
    return {
        name: {"input": f"{base_drive}/{rel}", "output": merged_output}
        for name, rel in relative_inputs.items()
    }
def process_file(input_path, output_path):
    """Append every converted message from input_path to output_path as 'ham'.

    Reads the ``converted_message`` column of the input CSV and appends one
    ``{'v1': 'ham', 'v2': <message>}`` row per input row to the output CSV
    (created with columns v1/v2 if it does not exist yet).

    Returns a stats dict (``total_processed``, ``new_entries``, ``updates``,
    ``start_time``) consumed by generate_report. Re-raises on any failure to
    read/write the files.
    """
    try:
        logging.info(f"\nProcessing file: {input_path}")

        input_df = pd.read_csv(input_path)
        logging.info(f"Input file loaded: {len(input_df)} rows")

        # Load the existing merged output, or start a fresh frame.
        if os.path.exists(output_path):
            output_df = pd.read_csv(output_path)
            logging.info(f"Existing output file loaded: {len(output_df)} rows")
        else:
            output_df = pd.DataFrame(columns=['v1', 'v2'])
            logging.info("Created new output file")

        stats = {
            'total_processed': 0,
            'new_entries': 0,
            'updates': 0,  # always 0 here; kept for generate_report's schema
            'start_time': time.time()
        }

        # Accumulate new rows in a plain list and concat once: calling
        # pd.concat per row (as before) copies the whole frame each time,
        # making the loop O(n^2) in the number of messages.
        new_rows = []
        with tqdm(total=len(input_df), desc="Processing messages") as pbar:
            for _, row in input_df.iterrows():
                try:
                    new_rows.append({
                        'v1': 'ham',
                        'v2': row['converted_message']
                    })
                    stats['new_entries'] += 1
                    stats['total_processed'] += 1
                except Exception as e:
                    logging.error(f"Error processing row: {e}")
                    continue
                pbar.update(1)
                # Periodic checkpoint so a crash loses at most ~100 rows.
                if stats['total_processed'] % 100 == 0:
                    pd.concat([output_df, pd.DataFrame(new_rows)],
                              ignore_index=True).to_csv(output_path, index=False)
                    logging.info(f"Periodic save: {stats['total_processed']} messages processed")

        # Final save: merge all accumulated rows in a single concat.
        if new_rows:
            output_df = pd.concat([output_df, pd.DataFrame(new_rows)],
                                  ignore_index=True)
        output_df.to_csv(output_path, index=False)

        processing_time = time.time() - stats['start_time']
        avg_time = processing_time / max(stats['total_processed'], 1)  # avoid /0
        logging.info(f"""
Processing Complete:
- Total Processed: {stats['total_processed']}
- New Entries: {stats['new_entries']}
- Updates: {stats['updates']}
- Processing Time: {processing_time:.2f} seconds
- Average Time per Message: {avg_time:.2f} seconds
Output File: {output_path}
""")
        return stats
    except Exception as e:
        logging.error(f"Error processing file {input_path}: {e}")
        raise
def generate_report(file_stats, timestamp):
    """Write a plain-text summary of all per-file processing stats.

    Args:
        file_stats: mapping of file name -> stats dict as returned by
            process_file (keys: total_processed, new_entries, updates,
            start_time).
        timestamp: run id used in the report filename.

    The report goes to ``reports/processing_report_<timestamp>.txt``.
    NOTE(review): per-file elapsed time is computed here as
    ``time.time() - start_time``, so it includes time spent on files
    processed afterwards — it overstates per-file duration.
    """
    report_dir = 'reports'
    os.makedirs(report_dir, exist_ok=True)
    report_path = f"{report_dir}/processing_report_{timestamp}.txt"
    with open(report_path, 'w') as f:
        f.write(f"""
Processing Report - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
================================================================
Files Processed:
""")
        total_messages = 0
        total_new = 0
        total_updates = 0
        total_time = 0
        for file_name, stats in file_stats.items():
            f.write(f"""
{file_name}:
- Total Processed: {stats['total_processed']}
- New Entries: {stats['new_entries']}
- Updates: {stats['updates']}
- Processing Time: {time.time() - stats['start_time']:.2f} seconds
""")
            total_messages += stats['total_processed']
            total_new += stats['new_entries']
            total_updates += stats['updates']
            total_time += time.time() - stats['start_time']
        # Guard against ZeroDivisionError when nothing was processed
        # (e.g. every file failed and main() skipped it).
        avg_time = total_time / max(total_messages, 1)
        f.write(f"""
================================================================
Summary:
- Total Messages Processed: {total_messages}
- Total New Entries: {total_new}
- Total Updates: {total_updates}
- Total Processing Time: {total_time:.2f} seconds
- Average Time per Message: {avg_time:.2f} seconds
================================================================
""")
    logging.info(f"Report generated: {report_path}")
def main():
    """Mount Drive, merge every configured CSV into jj.csv, write a report.

    Per-file failures are logged and skipped; failures outside the per-file
    loop (mount, report) are logged and re-raised.
    """
    run_id = setup_logging()
    logging.info("Starting processing...")
    try:
        drive_root = mount_google_drive()
        targets = get_file_paths(drive_root)

        results = {}
        for name, io_paths in targets.items():
            logging.info(f"\nProcessing {name}...")
            try:
                results[name] = process_file(io_paths['input'], io_paths['output'])
            except Exception as e:
                # Skip this file but keep going with the rest.
                logging.error(f"Error processing {name}: {e}")

        generate_report(results, run_id)
        logging.info("Processing complete!")
    except Exception as e:
        logging.error(f"Critical error: {e}")
        raise
if __name__ == "__main__":
    main()