Gabor Cselle
committed on
Commit
·
d0f419a
1
Parent(s):
5dd5081
gen_sample_data script to generate sample text images
Browse files- .gitignore +1 -0
- gen_sample_data.py +61 -0
- requirements.txt +1 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
font_images
|
gen_sample_data.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Generate sample data with 800x400 images of fonts in /System/Library/Fonts
|
2 |
+
# 50 images per font, 1 font per image
|
3 |
+
|
4 |
+
|
5 |
+
import os
|
6 |
+
from PIL import Image, ImageDraw, ImageFont
|
7 |
+
import nltk
|
8 |
+
from nltk.corpus import brown
|
9 |
+
import random
|
10 |
+
|
11 |
+
# Fetch the Brown corpus so that nltk.corpus.brown can be read below
nltk.download('brown')

# Words of the Brown corpus 'news' category, looked up once and reused below
_news_words = brown.words(categories='news')

# Fixed prose sample: the first 50 words of the news category
prose_text = " ".join(_news_words[:50])

# Directory scanned for fonts, and directory the rendered images are saved to
font_dir = '/System/Library/Fonts/'
output_dir = './font_images'
os.makedirs(output_dir, exist_ok=True)

# Vocabulary for random sampling: the unique news words, in sorted order
all_brown_words = sorted(set(_news_words))
|
22 |
+
|
23 |
+
def wrap_text(text, line_length=10):
    """
    Break ``text`` into lines holding at most ``line_length`` words each.

    Words are the whitespace-separated tokens of ``text``. Within a line they
    are rejoined with single spaces, and the lines are joined with newlines.
    """
    tokens = text.split()
    wrapped_lines = []
    for start in range(0, len(tokens), line_length):
        chunk = tokens[start:start + line_length]
        wrapped_lines.append(" ".join(chunk))
    return "\n".join(wrapped_lines)
|
29 |
+
|
30 |
+
def random_prose_text(words, num_words=200):
    """
    Build a block of random prose from ``words``.

    Picks up to ``num_words`` entries of ``words`` at random, without
    replacement, joins them with spaces, and lays the result out with
    ``wrap_text`` using its default line length.

    Args:
        words: Sequence of candidate words to sample from.
        num_words: Maximum number of words to pick. When ``words`` holds
            fewer entries than this, all of them are used.

    Returns:
        The sampled words as newline-wrapped text.
    """
    # random.sample raises ValueError when asked for more items than the
    # population holds, so clamp the request the same way random_code_text does.
    sample_size = min(num_words, len(words))
    random_words = " ".join(random.sample(words, sample_size))
    return wrap_text(random_words)
|
33 |
+
|
34 |
+
def random_code_text(base_code, num_lines=15):
    """
    Return a random selection of lines from ``base_code``.

    ``base_code`` is split on newlines and at most ``num_lines`` of the
    resulting lines are drawn without replacement, then joined back together
    with newlines in the order they were drawn.
    """
    candidate_lines = base_code.split("\n")
    count = min(num_lines, len(candidate_lines))
    picked = random.sample(candidate_lines, count)
    return "\n".join(picked)
|
37 |
+
|
38 |
+
# Render 50 sample images of random prose for every .ttf font in font_dir.
for font_file in os.listdir(font_dir):
    if not font_file.endswith('.ttf'):
        continue

    font_path = os.path.join(font_dir, font_file)
    # splitext removes only the final extension, so a file name containing
    # inner dots keeps its full stem and cannot collide with another font's
    # output files.
    font_name = os.path.splitext(font_file)[0]
    print(font_name)

    # Image numbers run from 1 to 50 and become part of the output file name.
    for image_number in range(1, 51):
        text = random_prose_text(all_brown_words)

        font_size = random.choice(range(32, 128))  # 32..127 inclusive
        try:
            font = ImageFont.truetype(font_path, font_size)
        except OSError as err:
            # A font file that cannot be loaded should not abort the whole
            # run; report it and move on to the next font.
            print(f"Skipping {font_file}: {err}")
            break

        img = Image.new('RGB', (800, 400), color="white")  # Canvas size
        draw = ImageDraw.Draw(img)

        # Random offsets, but ensuring that text isn't too far off the canvas
        offset_x = random.randint(-20, 10)
        offset_y = random.randint(-20, 10)
        draw.text((offset_x, offset_y), text, fill="black", font=font)

        output_file = os.path.join(output_dir, f"{font_name}_{image_number}.png")
        img.save(output_file)
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Pillow==9.5.0
|