baqu2213 committed on
Commit
ad7e448
·
verified ·
1 Parent(s): 254e753

Upload 2 files

Browse files
.gitattributes CHANGED
@@ -152,3 +152,4 @@ NAIA/Beta/NAIA[[:space:]]v1.12[[:space:]](testv3).exe filter=lfs diff=lfs merge=
152
  NAIA/Beta/NAIA[[:space:]]v1.12.exe filter=lfs diff=lfs merge=lfs -text
153
  NAIA/Beta/NAIA[[:space:]]v1.13[[:space:]]testv1.exe filter=lfs diff=lfs merge=lfs -text
154
  NAIA/Beta/NAIA[[:space:]]v1.13[[:space:]]testv2.exe filter=lfs diff=lfs merge=lfs -text
 
 
152
  NAIA/Beta/NAIA[[:space:]]v1.12.exe filter=lfs diff=lfs merge=lfs -text
153
  NAIA/Beta/NAIA[[:space:]]v1.13[[:space:]]testv1.exe filter=lfs diff=lfs merge=lfs -text
154
  NAIA/Beta/NAIA[[:space:]]v1.13[[:space:]]testv2.exe filter=lfs diff=lfs merge=lfs -text
155
+ NAIA/Beta/parquet_token_update_tool.exe filter=lfs diff=lfs merge=lfs -text
NAIA/Beta/parquet_token_update_tool.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:957e920ef0ceafd91952f3b761585da854be3265878e82f01952a2f5c4b3484c
3
+ size 259772306
NAIA/Beta/parquet_token_update_tool.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from transformers import CLIPTokenizer
4
+ import tkinter as tk
5
+ from tkinter import filedialog, scrolledtext
6
+ import threading
7
+
8
+ # Initialize the CLIP tokenizer once at import time; rcs() reuses this instance.
9
+ s_token = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
10
+
11
def rcs(text):
    """Return how many token ids the CLIP tokenizer produces for *text*.

    ``None`` is passed through unchanged so callers can keep missing rows
    missing.  The count is simply ``len(s_token.encode(text))``; whether that
    includes special tokens depends on the tokenizer (presumably it does --
    confirm against the transformers documentation).
    """
    return None if text is None else len(s_token.encode(text))
16
+
17
def process_files(dflist, text_box, select_button, process_button):
    """Add a ``tokens`` column to each parquet file and save a copy in ``processed/``.

    Every path in *dflist* is read with the pyarrow engine, each value of its
    ``general`` column is passed through ``rcs`` (``None`` values stay
    ``None``), and the resulting frame is written to
    ``processed/<original file name>`` relative to the current working
    directory.  Progress lines are appended to *text_box* every 100 rows and
    on the last row of each file.

    Args:
        dflist: Mutable list of parquet file paths.  It is emptied when the
            run ends so the same files are not processed again next time.
        text_box: Tk text widget that receives progress and error messages.
        select_button: Button that is re-enabled when the run ends.
        process_button: Button that is re-enabled when the run ends.

    Raises:
        Exception: Any error raised while reading, tokenizing or writing is
            reported in *text_box* and then re-raised.
    """
    # NOTE(review): start_processing runs this function on a worker thread,
    # yet it talks to Tk widgets directly.  Tkinter is not guaranteed to be
    # thread-safe; consider marshalling UI updates through widget.after().
    # NOTE(review): files from different folders that share a base name
    # overwrite each other in the output folder.
    output_dir = "processed"

    def log(message):
        # Append a message and keep the newest line visible.
        text_box.insert(tk.END, message)
        text_box.see(tk.END)

    try:
        os.makedirs(output_dir, exist_ok=True)

        for path in dflist:
            df = pd.read_parquet(path, engine="pyarrow")

            tokens = []
            total = len(df)
            for idx, text in enumerate(df['general']):
                tokens.append(None if text is None else rcs(text))

                # Report progress for every row position, including rows whose
                # text is missing, so the final 100% line is never skipped.
                done = idx + 1
                if done % 100 == 0 or done == total:
                    log(f"Processing file {path}: {done}/{total} ({done / total * 100:.2f}%)\n")

            df['tokens'] = tokens

            output_path = os.path.join(output_dir, os.path.basename(path))
            df.to_parquet(output_path, engine="pyarrow")
            log(f"Finished processing {path}\n")

        # User-facing message: "All files were processed successfully."
        log("모든 파일이 성공적으로 처리되었습니다.\n")

        # Open the output folder in the file manager; os.startfile only
        # exists on Windows, so skip it elsewhere instead of crashing.
        if hasattr(os, "startfile"):
            os.startfile(output_dir)
    except Exception as exc:
        # Surface the failure in the UI; a traceback on stderr is invisible
        # when the tool runs as a windowed executable.
        log(f"Error: {exc}\n")
        raise
    finally:
        # Empty the caller's list in place (rebinding the parameter would
        # leave the shared list untouched), then unlock the UI even when the
        # run failed.
        dflist.clear()
        select_button.config(state=tk.NORMAL)
        process_button.config(state=tk.NORMAL)
63
+
64
def select_files():
    """Ask the user for parquet files and queue the chosen paths in ``dflist``.

    Nothing happens when the dialog is dismissed without a selection.
    """
    chosen = filedialog.askopenfilenames(filetypes=[("Parquet files", "*.parquet")])
    if not chosen:
        return

    dflist.extend(chosen)
    # Echo the selection and keep the newest line visible.
    text_box.insert(tk.END, f"Selected files:\n{chosen}\n")
    text_box.see(tk.END)
70
+
71
def start_processing():
    """Disable both buttons and run ``process_files`` on a background thread.

    Does nothing when no files have been queued in ``dflist``.
    """
    if dflist:
        # Lock the controls while the worker runs.
        for button in (select_button, process_button):
            button.config(state=tk.DISABLED)

        # Hand the queued paths and the widgets to the worker thread.
        worker = threading.Thread(
            target=process_files,
            args=(dflist, text_box, select_button, process_button),
        )
        worker.start()
79
+
80
# Tkinter UI setup.
root = tk.Tk()
# Window title: "Parquet file token update tool for Prompt Stacker".
root.title("프롬프트 스태커용 parquet 파일 토큰 업데이트 도구")

# Row holding the two action buttons.
frame = tk.Frame(root)
frame.pack(padx=10, pady=10)

# Button label: "Select Parquet files".
select_button = tk.Button(frame, text="Parquet 파일 선택", command=select_files)
select_button.pack(side=tk.LEFT, padx=5, pady=5)

# Button label: "Start token calculation".
process_button = tk.Button(frame, text="토큰 계산 시작", command=start_processing)
process_button.pack(side=tk.LEFT, padx=5, pady=5)

# Scrollable log area that select_files and process_files write to.
text_box = scrolledtext.ScrolledText(root, width=80, height=20)
text_box.pack(padx=10, pady=10)

# Shared list of queued file paths; select_files appends to it.
dflist = []

root.mainloop()