Gabor Cselle committed on
Commit
d0f419a
·
1 Parent(s): 5dd5081

gen_sample_data script to generate sample text images

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. gen_sample_data.py +61 -0
  3. requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ font_images
gen_sample_data.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generate sample data with 800x400 images of fonts in /System/Library/Fonts
2
+ # 50 images per font, 1 font per image
3
+
4
+
5
+ import os
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ import nltk
8
+ from nltk.corpus import brown
9
+ import random
10
+
11
# Download the necessary data from nltk
nltk.download('brown')

# Build the news-category word view once and reuse it for both the fixed
# sample and the vocabulary below, instead of asking the corpus reader twice.
_news_words = brown.words(categories='news')

# Sample text for prose: the first 50 words from the news category
prose_text = " ".join(_news_words[:50])

# Directory scanned for font files, and directory that receives the PNGs.
font_dir = '/System/Library/Fonts/'
output_dir = './font_images'
os.makedirs(output_dir, exist_ok=True)

# Deduplicated vocabulary. random.sample needs a sequence rather than a set,
# so the unique words are materialized as a sorted list.
all_brown_words = sorted(set(_news_words))
22
+
23
def wrap_text(text, line_length=10):
    """
    Wrap the provided text every 'line_length' words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (including newlines already present in ``text``) collapses to a single
    separator before the text is re-flowed.

    Args:
        text: The text to re-flow.
        line_length: Maximum number of words per output line; must be >= 1.

    Returns:
        The words joined by single spaces, with a newline inserted after
        every ``line_length`` words. Empty or whitespace-only input yields
        an empty string.

    Raises:
        ValueError: If ``line_length`` is less than 1.
    """
    # Reject non-positive lengths explicitly: 0 would fail inside range() with
    # an unrelated message, and a negative step would silently drop all text.
    if line_length < 1:
        raise ValueError(f"line_length must be at least 1, got {line_length}")

    words = text.split()
    return "\n".join(
        " ".join(words[start:start + line_length])
        for start in range(0, len(words), line_length)
    )
29
+
30
def random_prose_text(words, num_words=200):
    """
    Build a block of pseudo-prose from randomly chosen words.

    Args:
        words: Sequence of candidate words to draw from (without replacement).
        num_words: Number of words requested. If fewer words are available,
            all of them are used; negative values are treated as 0.

    Returns:
        The sampled words passed through ``wrap_text`` with its default
        line length.
    """
    # Clamp the request so random.sample never sees a size larger than the
    # population (or a negative one), both of which raise ValueError.
    sample_size = max(0, min(num_words, len(words)))
    random_words = " ".join(random.sample(words, sample_size))
    return wrap_text(random_words)
33
+
34
def random_code_text(base_code, num_lines=15):
    """
    Pick random lines from a block of code text.

    Args:
        base_code: Source text to sample lines from.
        num_lines: Number of lines requested. If the text has fewer lines,
            all of them are used; negative values are treated as 0.

    Returns:
        The sampled lines joined with newlines. Lines are drawn without
        replacement and appear in the random order chosen by
        ``random.sample``, not in their original order.
    """
    # splitlines() avoids a phantom empty "line" after a trailing newline and
    # does not leave a stray "\r" on lines of CRLF-terminated text.
    lines = base_code.splitlines()
    sample_size = max(0, min(num_lines, len(lines)))
    return "\n".join(random.sample(lines, sample_size))
37
+
38
# Render 50 randomized text images for every .ttf font found in font_dir.
# Directory entries are sorted so fonts are processed in a stable order.
for font_file in sorted(os.listdir(font_dir)):
    # Compare the extension case-insensitively so "Foo.TTF" is not skipped.
    if not font_file.lower().endswith('.ttf'):
        continue

    font_path = os.path.join(font_dir, font_file)
    # splitext strips only the final extension, so "Name.Bold.ttf" becomes
    # "Name.Bold" rather than "Name"; this keeps output names from colliding.
    font_name = os.path.splitext(font_file)[0]
    print(font_name)

    # Probe the font once up front: a file that cannot be read as a font
    # should skip this font rather than abort the whole run.
    try:
        ImageFont.truetype(font_path, 32)
    except OSError as err:
        print(f"Skipping {font_file}: {err}")
        continue

    # Running image counter for this font; used as the file-name suffix.
    j = 0
    for _ in range(50):  # Generate 50 images per font
        prose_sample = random_prose_text(all_brown_words)

        # Single-element list of text variants rendered for each sample.
        for text in [prose_sample]:
            img = Image.new('RGB', (800, 400), color="white")  # Canvas size
            draw = ImageDraw.Draw(img)
            # Font size is drawn uniformly from 32..127 points.
            font_size = random.randrange(32, 128)
            font = ImageFont.truetype(font_path, font_size)

            # Random offsets, but ensuring that text isn't too far off the canvas
            offset_x = random.randint(-20, 10)
            offset_y = random.randint(-20, 10)
            draw.text((offset_x, offset_y), text, fill="black", font=font)

            j += 1
            output_file = os.path.join(output_dir, f"{font_name}_{j}.png")
            img.save(output_file)
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Pillow==9.5.0