Ki-Seki committed on
Commit
bc989cb
·
1 Parent(s): 72d3cb0

chore: update

Browse files
Files changed (3) hide show
  1. app.py +33 -16
  2. autotab.py +49 -21
  3. requirements.txt +1 -0
app.py CHANGED
@@ -1,31 +1,41 @@
 
 
 
1
  import gradio as gr
 
2
 
3
  from autotab import AutoTab
4
- import json
5
 
6
 
7
  def auto_tabulator_completion(
8
- in_file,
9
- instruction,
10
- max_examples,
11
- model_name,
12
- generation_config,
13
- save_every,
14
- ):
 
 
 
15
  output_file_name = "ouput.xlsx"
16
  autotab = AutoTab(
17
- in_file_path=in_file.name,
18
- instruction=instruction,
19
  out_file_path=output_file_name,
 
20
  max_examples=max_examples,
21
  model_name=model_name,
22
- api_key="sk-exhahhjfqyanmwewndukcqtrpegfdbwszkjucvcpajdufiah",
23
- base_url="https://public-beta-api.siliconflow.cn/v1",
24
  generation_config=json.loads(generation_config),
 
25
  save_every=save_every,
 
 
26
  )
 
27
  autotab.run()
28
- return output_file_name, autotab.data[:15]
 
 
29
 
30
 
31
  # Gradio interface
@@ -35,17 +45,24 @@ inputs = [
35
  value="You are a helpful assistant. Help me finish the task.",
36
  label="Instruction",
37
  ),
38
- gr.Slider(value=5, minimum=1, maximum=100, label="Max Examples"),
39
  gr.Textbox(value="Qwen/Qwen2-7B-Instruct", label="Model Name"),
40
  gr.Textbox(
41
  value='{"temperature": 0, "max_tokens": 128}',
42
  label="Generation Config in Dict",
43
  ),
44
- gr.Slider(value=10, minimum=1, maximum=1000, label="Save Every N Steps"),
 
 
 
 
 
45
  ]
46
 
47
  outputs = [
 
48
  gr.File(label="Output Excel File"),
 
49
  gr.Dataframe(label="First 15 rows."),
50
  ]
51
 
@@ -54,5 +71,5 @@ gr.Interface(
54
  inputs=inputs,
55
  outputs=outputs,
56
  title="Auto Tabulator Completion",
57
- description="Automatically complete missing output values in tabular data based on in-context learning. Visit https://github.com/Ki-Seki/autotab for more information.",
58
  ).launch()
 
1
+ import json
2
+ import time
3
+
4
  import gradio as gr
5
+ import pandas as pd
6
 
7
  from autotab import AutoTab
 
8
 
9
 
def auto_tabulator_completion(
    in_file_path: str,
    instruction: str,
    max_examples: int,
    model_name: str,
    generation_config: str,
    request_interval: float,
    save_every: int,
    api_key: str,
    base_url: str,
) -> tuple[str, str, str, pd.DataFrame]:
    """Run AutoTab over a spreadsheet and return what the interface displays.

    Args:
        in_file_path: Path of the input spreadsheet, forwarded to AutoTab.
        instruction: Instruction text forwarded to AutoTab.
        max_examples: Forwarded to AutoTab as ``max_examples``.
        model_name: Forwarded to AutoTab as ``model_name``.
        generation_config: JSON text describing an object, e.g.
            ``'{"temperature": 0, "max_tokens": 128}'``. It is decoded here
            and the resulting dict is what AutoTab receives.
        request_interval: Forwarded to AutoTab as ``request_interval``.
        save_every: Forwarded to AutoTab as ``save_every``.
        api_key: Forwarded to AutoTab as ``api_key``.
        base_url: Forwarded to AutoTab as ``base_url``.

    Returns:
        A 4-tuple: the elapsed run time formatted as ``HH:MM:SS``, the output
        file name, ``autotab.query_example``, and the first 15 rows of
        ``autotab.data``.

    Raises:
        ValueError: If ``generation_config`` is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``) or does not decode
            to a JSON object.
    """
    # Decode and validate before building AutoTab, so a bad config fails
    # immediately and not in the middle of a run.
    parsed_config = json.loads(generation_config)
    if not isinstance(parsed_config, dict):
        raise ValueError(
            "generation_config must be a JSON object, got "
            f"{type(parsed_config).__name__}"
        )

    # NOTE(review): "ouput" looks like a typo for "output"; the literal is
    # kept unchanged because it is the file name users receive.
    output_file_name = "ouput.xlsx"
    autotab = AutoTab(
        in_file_path=in_file_path,
        out_file_path=output_file_name,
        instruction=instruction,
        max_examples=max_examples,
        model_name=model_name,
        generation_config=parsed_config,
        request_interval=request_interval,
        save_every=save_every,
        api_key=api_key,
        base_url=base_url,
    )

    # A monotonic clock keeps the duration correct if the system time changes.
    start = time.perf_counter()
    autotab.run()
    elapsed = time.perf_counter() - start
    time_taken = time.strftime("%H:%M:%S", time.gmtime(elapsed))

    # NOTE(review): query_example is not assigned in the visible AutoTab code;
    # presumably set during run() — confirm.
    return time_taken, output_file_name, autotab.query_example, autotab.data[:15]
39
 
40
 
41
  # Gradio interface
 
45
  value="You are a helpful assistant. Help me finish the task.",
46
  label="Instruction",
47
  ),
48
+ gr.Slider(value=5, minimum=1, maximum=50, step=1, label="Max Examples"),
49
  gr.Textbox(value="Qwen/Qwen2-7B-Instruct", label="Model Name"),
50
  gr.Textbox(
51
  value='{"temperature": 0, "max_tokens": 128}',
52
  label="Generation Config in Dict",
53
  ),
54
+ gr.Slider(value=0.1, minimum=0, maximum=10, label="Request Interval in Seconds"),
55
+ gr.Slider(value=100, minimum=1, maximum=1000, step=1, label="Save Every N Steps"),
56
+ gr.Textbox(
57
+ value="sk-exhahhjfqyanmwewndukcqtrpegfdbwszkjucvcpajdufiah", label="API Key"
58
+ ),
59
+ gr.Textbox(value="https://public-beta-api.siliconflow.cn/v1", label="Base URL"),
60
  ]
61
 
62
  outputs = [
63
+ gr.Textbox(label="Time Taken"),
64
  gr.File(label="Output Excel File"),
65
+ gr.Textbox(label="Query Example"),
66
  gr.Dataframe(label="First 15 rows."),
67
  ]
68
 
 
71
  inputs=inputs,
72
  outputs=outputs,
73
  title="Auto Tabulator Completion",
74
+ description="Automatically complete missing output values in tabular data based on in-context learning.",
75
  ).launch()
autotab.py CHANGED
@@ -1,7 +1,10 @@
1
  import re
 
 
2
 
3
  import openai
4
  import pandas as pd
 
5
  from tqdm import tqdm
6
 
7
 
@@ -10,23 +13,25 @@ class AutoTab:
10
  self,
11
  in_file_path: str,
12
  out_file_path: str,
 
13
  max_examples: int,
14
  model_name: str,
15
- api_key: str,
16
- base_url: str,
17
  generation_config: dict,
 
18
  save_every: int,
19
- instruction: str,
 
20
  ):
21
  self.in_file_path = in_file_path
22
  self.out_file_path = out_file_path
 
23
  self.max_examples = max_examples
24
  self.model_name = model_name
25
- self.api_key = api_key
26
- self.base_url = base_url
27
  self.generation_config = generation_config
 
28
  self.save_every = save_every
29
- self.instruction = instruction
 
30
 
31
  # ─── IO ───────────────────────────────────────────────────────────────
32
 
@@ -39,8 +44,10 @@ class AutoTab:
39
 
40
  # ─── LLM ──────────────────────────────────────────────────────────────
41
 
 
42
  def openai_request(self, query: str) -> str:
43
  """Make a request to an OpenAI-format API."""
 
44
  client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
45
  response = client.chat.completions.create(
46
  model=self.model_name,
@@ -68,7 +75,6 @@ class AutoTab:
68
  for col in output_columns
69
  )
70
  in_context += "\n"
71
- self.in_context = in_context
72
  return in_context
73
 
74
  def predict_output(
@@ -101,19 +107,41 @@ class AutoTab:
101
 
102
  # ─── Engine ───────────────────────────────────────────────────────────
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  def run(self):
105
- data, input_fields, output_fields = self.load_excel()
106
- in_context = self.derive_incontext(data, input_fields, output_fields)
107
-
108
- num_existed_examples = len(data.dropna(subset=output_fields))
109
-
110
- for i in tqdm(range(num_existed_examples, len(data))):
111
- prediction = self.predict_output(in_context, data.iloc[i], input_fields)
112
- extracted_fields = self.extract_fields(prediction, output_fields)
113
- for field_name in output_fields:
114
- data.at[i, field_name] = extracted_fields.get(field_name, "")
115
- if i % self.save_every == 0:
116
- data.to_excel(self.out_file_path, index=False)
117
- self.data = data
118
- data.to_excel(self.out_file_path, index=False)
 
 
 
 
119
  print(f"Results saved to {self.out_file_path}")
 
1
  import re
2
+ import time
3
+ from concurrent.futures import ThreadPoolExecutor
4
 
5
  import openai
6
  import pandas as pd
7
+ from tenacity import retry, stop_after_attempt, wait_random_exponential
8
  from tqdm import tqdm
9
 
10
 
 
13
  self,
14
  in_file_path: str,
15
  out_file_path: str,
16
+ instruction: str,
17
  max_examples: int,
18
  model_name: str,
 
 
19
  generation_config: dict,
20
+ request_interval: float,
21
  save_every: int,
22
+ api_key: str,
23
+ base_url: str,
24
  ):
25
  self.in_file_path = in_file_path
26
  self.out_file_path = out_file_path
27
+ self.instruction = instruction
28
  self.max_examples = max_examples
29
  self.model_name = model_name
 
 
30
  self.generation_config = generation_config
31
+ self.request_interval = request_interval
32
  self.save_every = save_every
33
+ self.api_key = api_key
34
+ self.base_url = base_url
35
 
36
  # ─── IO ───────────────────────────────────────────────────────────────
37
 
 
44
 
45
  # ─── LLM ──────────────────────────────────────────────────────────────
46
 
47
+ @retry(wait=wait_random_exponential(min=20, max=60), stop=stop_after_attempt(6))
48
  def openai_request(self, query: str) -> str:
49
  """Make a request to an OpenAI-format API."""
50
+ time.sleep(self.request_interval)
51
  client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
52
  response = client.chat.completions.create(
53
  model=self.model_name,
 
75
  for col in output_columns
76
  )
77
  in_context += "\n"
 
78
  return in_context
79
 
80
  def predict_output(
 
107
 
108
  # ─── Engine ───────────────────────────────────────────────────────────
109
 
def _predict_and_extract(self, i: int) -> dict[str, str]:
    """Predict the outputs for the row at position ``i`` and parse them.

    The row is looked up positionally in ``self.data``; the raw prediction
    from ``predict_output`` is passed to ``extract_fields`` together with
    ``self.output_fields`` and that result is returned.
    """
    row = self.data.iloc[i]
    raw_prediction = self.predict_output(self.in_context, row, self.input_fields)
    return self.extract_fields(raw_prediction, self.output_fields)
117
+
def batch_prediction(self, start_index: int, end_index: int):
    """Predict rows ``start_index`` .. ``end_index - 1`` concurrently.

    Each row is handed to ``_predict_and_extract`` on a thread pool. Every
    row that succeeds has its output fields written into ``self.data``
    (a missing field becomes ``""``), even when other rows in the same batch
    fail. Failed rows are left untouched.

    Args:
        start_index: First row to process (inclusive).
        end_index: Row to stop at (exclusive).

    Raises:
        Exception: Re-raises the first error hit by any row, after the
            successful rows have been written.
    """
    rows = range(start_index, end_index)
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(self._predict_and_extract, i) for i in rows]
    # Leaving the ``with`` block waits for every future, so no worker is still
    # reading ``self.data`` while it is being modified below.

    first_error = None
    for i, future in zip(rows, futures):
        try:
            extracted_fields = future.result()
        except Exception as err:
            # Remember the failure but keep harvesting the other rows.
            if first_error is None:
                first_error = err
            continue
        for field_name in self.output_fields:
            self.data.at[i, field_name] = extracted_fields.get(field_name, "")

    if first_error is not None:
        raise first_error
127
+
def run(self):
    """Fill in the missing output rows batch by batch, saving as it goes.

    Loads the table via ``load_excel``, builds the in-context text via
    ``derive_incontext``, then processes the rows after the existing
    examples in batches of ``save_every`` rows. The spreadsheet is written
    to ``self.out_file_path`` after every batch and once more at the end.
    A failing batch is reported and skipped so the run keeps going.
    """
    self.data, self.input_fields, self.output_fields = self.load_excel()
    self.in_context = self.derive_incontext(
        self.data, self.input_fields, self.output_fields
    )

    self.num_data = len(self.data)
    # Count of rows whose output fields are all present. Processing starts at
    # this position, i.e. this assumes the completed rows come first.
    self.num_examples = len(self.data.dropna(subset=self.output_fields))

    # range() needs a positive integer step.
    batch_size = max(1, int(self.save_every))

    # The context manager closes the bar even if the loop is interrupted.
    with tqdm(total=self.num_data - self.num_examples, leave=False) as tqdm_bar:
        for start in range(self.num_examples, self.num_data, batch_size):
            end = min(start + batch_size, self.num_data)
            try:
                self.batch_prediction(start, end)
            except Exception as e:
                # Best effort: report which rows failed and carry on.
                print(f"Rows {start}-{end - 1} failed: {e!r}")
            self.data.to_excel(self.out_file_path, index=False)
            # Advance only once the batch has actually been attempted.
            tqdm_bar.update(end - start)

    # Final save also covers the case where there was nothing to predict.
    self.data.to_excel(self.out_file_path, index=False)
    print(f"Results saved to {self.out_file_path}")
requirements.txt CHANGED
@@ -3,3 +3,4 @@ openai
3
  argparse
4
  openpyxl
5
  gradio
 
 
3
  argparse
4
  openpyxl
5
  gradio
6
+ tenacity