# Delete problematic images, as flagged by the ModelScope OCR API, so that the
# remaining images pass content review.
import base64
import json
import os
from io import BytesIO

import pandas as pd  # not referenced in this script; kept in case other code relies on it
from PIL import Image
import requests

# Gradio "predict" endpoint of the ModelScope OFA OCR studio.
OCR_URL = "https://www.modelscope.cn/api/v1/studio/damo/ofa_ocr_pipeline/gradio/api/predict/"

# File-name suffixes (case-sensitive) that are treated as images.
IMAGE_SUFFIXES = (".jpg", ".png")


def ocr(image, timeout=60):
    """Send one image to the ModelScope OCR endpoint and return the parsed JSON reply.

    Args:
        image: Anything ``PIL.Image.open`` accepts (a path or a binary file object).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        OSError: The image cannot be opened or decoded.
        requests.RequestException: The HTTP request failed or timed out.
        ValueError: The response body is not valid JSON.
    """
    # The context manager releases the file handle right away, so the caller
    # can delete the file afterwards (an open handle blocks removal on Windows).
    with Image.open(image) as img:
        image_format = img.format
        img_buffer = BytesIO()
        img.save(img_buffer, format=image_format)

    base64_str = base64.b64encode(img_buffer.getvalue()).decode()

    # Advertise the real encoding of the payload instead of always claiming JPEG.
    mime_type = f"image/{image_format.lower()}"
    payload = json.dumps({
        "data": [f"data:{mime_type};base64,{base64_str}"],
        "dataType": ["image"],
    })
    headers = {
        'Content-Type': 'application/json',
    }

    # No raise_for_status() here: the caller inspects the JSON body for an
    # 'error' key, so error responses must still be parsed and returned.
    response = requests.request("POST", OCR_URL, headers=headers, data=payload, timeout=timeout)
    return json.loads(response.text)


def _find_images(root_dir):
    """Return relative paths of every .jpg/.png file under root_dir, recursively."""
    return [
        os.path.relpath(os.path.join(root, file))
        for root, _dirs, files in os.walk(root_dir)
        for file in files
        if file.endswith(IMAGE_SUFFIXES)
    ]


def main():
    """Check every image under ./manga and delete those the OCR service rejects."""
    # Path of the image sub-directory under the current working directory.
    img_path = 'manga'
    subdir_path = os.path.join(os.getcwd(), img_path)

    # Collect the image files, including those in nested sub-directories.
    image_files = _find_images(subdir_path)

    for image_path in image_files:
        try:
            # NOTE(review): the original comment says the result is dataframe-like
            # with two columns, boxid and text -- confirm against the API response.
            result = ocr(image_path)
        except (OSError, requests.RequestException, ValueError) as exc:
            # A failed check proves nothing about the image itself: keep the
            # file and move on rather than aborting the whole batch.
            print(image_path, "check failed, image kept:", exc)
            continue

        if 'error' in result:
            print("发现问题图片,需要删除以过审:",image_path)
            os.remove(image_path)
        else:
            print(image_path, "图片没有问题")


if __name__ == '__main__':
    main()