Phi-3-HornyVision-128k-instruct / new_captioner.py

Upload new_captioner.py

bf4ab95 verified 3 months ago

3.76 kB

	import base64
	import requests
	import os
	from openai import OpenAI
	from tqdm import tqdm
	import time
	import sys

	# Require the image folder as the first command-line argument.
	if len(sys.argv) < 2:
	    print("Please, provide the path to image folder.")
	    sys.exit(1)

	# Get the path to image dir from command line.
	image_dir = sys.argv[1]

	# Fail early with a clear message instead of a traceback from os.listdir
	# further down the script.
	if not os.path.isdir(image_dir):
	    print(f"Image folder not found: {image_dir}")
	    sys.exit(1)

	# Endpoint and key for the OpenAI-compatible server on localhost.
	# NOTE(review): "EMPTY" looks like a placeholder key; presumably the local
	# server does not validate it - confirm.
	openai_api_base = "http://localhost:8000/v1"
	openai_api_key = "EMPTY"
	client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

	# Show the id of the first model the server reports.
	available_models = client.models.list()
	model_type = available_models.data[0].id
	print(f'model_type: {model_type}')

	# Function to encode the image
	def encode_image(image_path):
	    """Return the contents of the file at ``image_path`` as base64 text.

	    Args:
	        image_path: Path of the file to read; it is opened in binary mode.

	    Returns:
	        The base64 encoding of the file's bytes, as a ``str``.

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    with open(image_path, "rb") as image_file:
	        return base64.b64encode(image_file.read()).decode('utf-8')

	# Directories
	# Folder with the tag captions from the WD tagger (one .txt per image stem).
	txt_dir = './txt/'
	# Folder that receives the generated captions.
	maintxt_dir = './maintxt/'
	image_path = ''

	# Ensure the output directory exists
	os.makedirs(maintxt_dir, exist_ok=True)

	# Get list of all JPEG images in the directory (extension match is
	# case-insensitive).
	image_files = [f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg'))]

	total_files = len(image_files)
	start_time = time.time()

	# The separator after {bar} is a plain '|'. The previous '\|' was an invalid
	# escape sequence and rendered a stray backslash in the progress bar.
	progress_bar = tqdm(
	    total=total_files,
	    unit='file',
	    bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]',
	)
	# Running totals used by the processing loop for its own ETA estimate.
	total_elapsed_time = 0
	processed_files = 0

	# Process all images in the image directory: for each image, read its tag
	# file, send tags + image to the model, and save the returned caption.
	for image_file in image_files:
	    stem = os.path.splitext(image_file)[0]
	    image_path = os.path.join(image_dir, image_file)
	    txt_file = os.path.join(txt_dir, stem + '.txt')
	    output_file = os.path.join(maintxt_dir, stem + '.txt')

	    # Read tags from the corresponding txt file. Decode explicitly as UTF-8
	    # so the result does not depend on the platform's default encoding
	    # (the caption below is written as UTF-8 too).
	    with open(txt_file, 'r', encoding='utf-8') as f:
	        tags = f.read().strip()

	    base64_image = encode_image(image_path)

	    # Only the model request itself is timed.
	    step_start_time = time.time()

	    # NOTE(review): the model name is hard-coded here while the script also
	    # fetches `model_type` from the server and never uses it - confirm
	    # which one is intended.
	    chat_response = client.chat.completions.create(
	        model="./phi3_v14_800-merged",
	        messages=[{
	            "role": "user",
	            "content": [
	                {"type": "text", "text": f"Make a caption that describe this image. Here is the tags for this image: {tags}"},
	                {
	                    "type": "image_url",
	                    "image_url": {
	                        "url": f"data:image/jpeg;base64,{base64_image}"
	                    },
	                },
	            ],
	        }],
	        extra_body={'repetition_penalty': 1.05, 'top_k': -1,'top_p': 1,'temperature': 0, 'use_beam_search': True, 'best_of':5},
	    )

	    step_end_time = time.time()
	    step_time = step_end_time - step_start_time
	    total_elapsed_time += step_time
	    # ETA = mean request time so far * number of files still to process.
	    remaining_time = (total_elapsed_time / (processed_files + 1)) * (total_files - processed_files - 1)

	    # Convert remaining time to hours, minutes and seconds
	    remaining_minutes, remaining_seconds = divmod(int(remaining_time), 60)
	    remaining_hours, remaining_minutes = divmod(remaining_minutes, 60)

	    # Extract the content from the response. The content may be None, so
	    # fall back to an empty caption instead of failing on .lstrip().
	    content = (chat_response.choices[0].message.content or "").lstrip()
	    # Write the content to the output file
	    with open(output_file, 'w', encoding='utf-8') as f:
	        f.write(content)

	    print(f"\n\nFile {image_file}\nProcessing time: {step_time:.2f} seconds\n{content}")
	    print(f"Response saved to file: {output_file}")

	    processed_files += 1
	    progress_bar.update(1)
	    progress_bar.set_postfix(remaining=f'{remaining_hours:02d}:{remaining_minutes:02d}:{remaining_seconds:02d}', refresh=True)

	progress_bar.close()
	print("All images processed.")
	print(f"Total time: {time.time() - start_time:.2f} seconds")