|
import os
|
|
import pandas as pd
|
|
from PIL import Image, UnidentifiedImageError
|
|
import torch
|
|
from torchvision import transforms
|
|
from transformers import AutoProcessor, FocalNetForImageClassification
|
|
import pyarrow as pa
|
|
import pyarrow.parquet as pq
|
|
|
|
|
|
# Folder that is searched recursively for images to classify.
image_folder = "scraped_images"
# Hugging Face model identifier used when the processor and model are loaded.
model_path = "MichalMlodawski/nsfw-image-detection-large"

# Collect every file below image_folder whose name ends in ".jpg"
# (case-insensitive). Other extensions such as ".jpeg" are not matched.
jpg_files = []
for root, _dirs, files in os.walk(image_folder):
    jpg_files.extend(
        os.path.join(root, file_name)
        for file_name in files
        if file_name.lower().endswith(".jpg")
    )

if not jpg_files:
    print("No jpg files found in folder:", image_folder)
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the `site` module for interactive sessions and is not
    # guaranteed to exist in every interpreter configuration. Raising without
    # an argument keeps the process exit status at 0, as before.
    raise SystemExit
|
|
|
|
|
|
# Load the image processor and the FocalNet classifier published under
# model_path.
feature_extractor = AutoProcessor.from_pretrained(model_path)

# Module.eval() returns the module itself, so the classifier is switched to
# evaluation mode in the same expression that loads it.
model = FocalNetForImageClassification.from_pretrained(model_path).eval()
|
|
|
|
|
|
# torchvision preprocessing pipeline: resize to 512x512, convert to a tensor,
# then normalise each channel with the mean/std values below.
# NOTE(review): the model inputs in the classification loop come from
# `feature_extractor`, not from this pipeline -- confirm it is still needed.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]
_preprocessing_steps = [
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
transform = transforms.Compose(_preprocessing_steps)
|
|
|
|
|
|
# Maps the label strings looked up in model.config.id2label to the category
# names stored in the results. Labels missing from this table are reported as
# "Unknown" by the lookup in the classification loop.
label_to_category = dict(
    LABEL_0="Safe",
    LABEL_1="Questionable",
    LABEL_2="Unsafe",
)
|
|
|
|
|
|
# One dict per successfully classified image; consumed after the loop.
results = []

# Skipped files are appended to this text file, one line per file.
error_log = "error_log.txt"


def _log_skipped(message):
    """Append *message* to the error log and echo it to stdout."""
    with open(error_log, "a", encoding="utf-8") as log_file:
        log_file.write(f"{message}\n")
    print(message)


for jpg_file in jpg_files:
    try:
        # The context manager closes the underlying file handle as soon as
        # the pixel data has been copied into the RGB image.
        with Image.open(jpg_file) as raw_image:
            image = raw_image.convert("RGB")
    except UnidentifiedImageError:
        _log_skipped(f"Unidentified image file: {jpg_file}. Skipping...")
        continue
    except OSError as error:
        # Truncated or otherwise unreadable files raise a plain OSError while
        # being decoded; skip them instead of aborting the whole run and
        # losing the results gathered so far.
        _log_skipped(f"Could not read image file: {jpg_file} ({error}). Skipping...")
        continue

    # The processor prepares the model inputs from the PIL image.
    inputs = feature_extractor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        # Highest probability and its class index along dimension 1.
        confidence, predicted = torch.max(probabilities, 1)

    label = model.config.id2label[predicted.item()]
    category = label_to_category.get(label, "Unknown")

    results.append({
        "file_path": jpg_file,
        "label": label,
        "category": category,
        # Softmax probability of the predicted class, scaled to a percentage.
        "confidence": confidence.item() * 100,
    })
|
|
|
|
|
|
# Fixing the column list keeps the output's column layout even when `results`
# is empty (every image was skipped). For non-empty results it matches the key
# order of the dicts appended by the classification loop.
df = pd.DataFrame(results, columns=["file_path", "label", "category", "confidence"])

# Write the results to a Parquet file in the current working directory.
parquet_file = "nsfw_classification_results.parquet"
table = pa.Table.from_pandas(df)
pq.write_table(table, parquet_file)

print(f"Classification completed and saved to {parquet_file}!")
|
|
|