Spaces:

Ki-Seki
/

AutoTab

Sleeping

App Files Community

Ki-Seki committed on Jul 27, 2024

Commit

bc989cb

1 Parent(s): 72d3cb0

chore: update

Browse files

Files changed (3) hide show

app.py +33 -16
autotab.py +49 -21
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,31 +1,41 @@
 import gradio as gr
 from autotab import AutoTab
-import json
 def auto_tabulator_completion(
-    in_file,
-    instruction,
-    max_examples,
-    model_name,
-    generation_config,
-    save_every,
-):
     output_file_name = "ouput.xlsx"
     autotab = AutoTab(
-        in_file_path=in_file.name,
-        instruction=instruction,
         out_file_path=output_file_name,
         max_examples=max_examples,
         model_name=model_name,
-        api_key="sk-exhahhjfqyanmwewndukcqtrpegfdbwszkjucvcpajdufiah",
-        base_url="https://public-beta-api.siliconflow.cn/v1",
         generation_config=json.loads(generation_config),
         save_every=save_every,
     )
     autotab.run()
-    return output_file_name, autotab.data[:15]
 # Gradio interface
@@ -35,17 +45,24 @@ inputs = [
         value="You are a helpful assistant. Help me finish the task.",
         label="Instruction",
     ),
-    gr.Slider(value=5, minimum=1, maximum=100, label="Max Examples"),
     gr.Textbox(value="Qwen/Qwen2-7B-Instruct", label="Model Name"),
     gr.Textbox(
         value='{"temperature": 0, "max_tokens": 128}',
         label="Generation Config in Dict",
     ),
-    gr.Slider(value=10, minimum=1, maximum=1000, label="Save Every N Steps"),
 ]
 outputs = [
     gr.File(label="Output Excel File"),
     gr.Dataframe(label="First 15 rows."),
 ]
@@ -54,5 +71,5 @@ gr.Interface(
     inputs=inputs,
     outputs=outputs,
     title="Auto Tabulator Completion",
-    description="Automatically complete missing output values in tabular data based on in-context learning. Visit https://github.com/Ki-Seki/autotab for more information.",
 ).launch()

+import json
+import time
 import gradio as gr
+import pandas as pd
 from autotab import AutoTab
 def auto_tabulator_completion(
+    in_file_path: str,
+    instruction: str,
+    max_examples: int,
+    model_name: str,
+    generation_config: dict,
+    request_interval: float,
+    save_every: int,
+    api_key: str,
+    base_url: str,
+) -> tuple[str, str, str, pd.DataFrame]:
     output_file_name = "ouput.xlsx"
     autotab = AutoTab(
+        in_file_path=in_file_path,
         out_file_path=output_file_name,
+        instruction=instruction,
         max_examples=max_examples,
         model_name=model_name,
         generation_config=json.loads(generation_config),
+        request_interval=request_interval,
         save_every=save_every,
+        api_key=api_key,
+        base_url=base_url,
     )
+    start = time.time()
     autotab.run()
+    time_taken = time.strftime("%H:%M:%S", time.gmtime(time.time() - start))
+    return time_taken, output_file_name, autotab.query_example, autotab.data[:15]
 # Gradio interface
         value="You are a helpful assistant. Help me finish the task.",
         label="Instruction",
     ),
+    gr.Slider(value=5, minimum=1, maximum=50, step=1, label="Max Examples"),
     gr.Textbox(value="Qwen/Qwen2-7B-Instruct", label="Model Name"),
     gr.Textbox(
         value='{"temperature": 0, "max_tokens": 128}',
         label="Generation Config in Dict",
     ),
+    gr.Slider(value=0.1, minimum=0, maximum=10, label="Request Interval in Seconds"),
+    gr.Slider(value=100, minimum=1, maximum=1000, step=1, label="Save Every N Steps"),
+    gr.Textbox(
+        value="sk-exhahhjfqyanmwewndukcqtrpegfdbwszkjucvcpajdufiah", label="API Key"
+    ),
+    gr.Textbox(value="https://public-beta-api.siliconflow.cn/v1", label="Base URL"),
 ]
 outputs = [
+    gr.Textbox(label="Time Taken"),
     gr.File(label="Output Excel File"),
+    gr.Textbox(label="Query Example"),
     gr.Dataframe(label="First 15 rows."),
 ]
     inputs=inputs,
     outputs=outputs,
     title="Auto Tabulator Completion",
+    description="Automatically complete missing output values in tabular data based on in-context learning.",
 ).launch()

autotab.py CHANGED Viewed

@@ -1,7 +1,10 @@
 import re
 import openai
 import pandas as pd
 from tqdm import tqdm
@@ -10,23 +13,25 @@ class AutoTab:
         self,
         in_file_path: str,
         out_file_path: str,
         max_examples: int,
         model_name: str,
-        api_key: str,
-        base_url: str,
         generation_config: dict,
         save_every: int,
-        instruction: str,
     ):
         self.in_file_path = in_file_path
         self.out_file_path = out_file_path
         self.max_examples = max_examples
         self.model_name = model_name
-        self.api_key = api_key
-        self.base_url = base_url
         self.generation_config = generation_config
         self.save_every = save_every
-        self.instruction = instruction
     # ─── IO ───────────────────────────────────────────────────────────────
@@ -39,8 +44,10 @@ class AutoTab:
     # ─── LLM ──────────────────────────────────────────────────────────────
     def openai_request(self, query: str) -> str:
         """Make a request to an OpenAI-format API."""
         client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
         response = client.chat.completions.create(
             model=self.model_name,
@@ -68,7 +75,6 @@ class AutoTab:
                 for col in output_columns
             )
             in_context += "\n"
-        self.in_context = in_context
         return in_context
     def predict_output(
@@ -101,19 +107,41 @@ class AutoTab:
     # ─── Engine ───────────────────────────────────────────────────────────
     def run(self):
-        data, input_fields, output_fields = self.load_excel()
-        in_context = self.derive_incontext(data, input_fields, output_fields)
-        num_existed_examples = len(data.dropna(subset=output_fields))
-        for i in tqdm(range(num_existed_examples, len(data))):
-            prediction = self.predict_output(in_context, data.iloc[i], input_fields)
-            extracted_fields = self.extract_fields(prediction, output_fields)
-            for field_name in output_fields:
-                data.at[i, field_name] = extracted_fields.get(field_name, "")
-            if i % self.save_every == 0:
-                data.to_excel(self.out_file_path, index=False)
-        self.data = data
-        data.to_excel(self.out_file_path, index=False)
         print(f"Results saved to {self.out_file_path}")

 import re
+import time
+from concurrent.futures import ThreadPoolExecutor
 import openai
 import pandas as pd
+from tenacity import retry, stop_after_attempt, wait_random_exponential
 from tqdm import tqdm
         self,
         in_file_path: str,
         out_file_path: str,
+        instruction: str,
         max_examples: int,
         model_name: str,
         generation_config: dict,
+        request_interval: float,
         save_every: int,
+        api_key: str,
+        base_url: str,
     ):
         self.in_file_path = in_file_path
         self.out_file_path = out_file_path
+        self.instruction = instruction
         self.max_examples = max_examples
         self.model_name = model_name
         self.generation_config = generation_config
+        self.request_interval = request_interval
         self.save_every = save_every
+        self.api_key = api_key
+        self.base_url = base_url
     # ─── IO ───────────────────────────────────────────────────────────────
     # ─── LLM ──────────────────────────────────────────────────────────────
+    @retry(wait=wait_random_exponential(min=20, max=60), stop=stop_after_attempt(6))
     def openai_request(self, query: str) -> str:
         """Make a request to an OpenAI-format API."""
+        time.sleep(self.request_interval)
         client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
         response = client.chat.completions.create(
             model=self.model_name,
                 for col in output_columns
             )
             in_context += "\n"
         return in_context
     def predict_output(
     # ─── Engine ───────────────────────────────────────────────────────────
+    def _predict_and_extract(self, i: int) -> dict[str, str]:
+        """Helper function to predict and extract fields for a single row."""
+        prediction = self.predict_output(
+            self.in_context, self.data.iloc[i], self.input_fields
+        )
+        extracted_fields = self.extract_fields(prediction, self.output_fields)
+        return extracted_fields
+    def batch_prediction(self, start_index: int, end_index: int):
+        """Process a batch of predictions asynchronously."""
+        with ThreadPoolExecutor() as executor:
+            results = list(
+                executor.map(self._predict_and_extract, range(start_index, end_index))
+            )
+        for i, extracted_fields in zip(range(start_index, end_index), results):
+            for field_name in self.output_fields:
+                self.data.at[i, field_name] = extracted_fields.get(field_name, "")
     def run(self):
+        self.data, self.input_fields, self.output_fields = self.load_excel()
+        self.in_context = self.derive_incontext(
+            self.data, self.input_fields, self.output_fields
+        )
+        self.num_data = len(self.data)
+        self.num_examples = len(self.data.dropna(subset=self.output_fields))
+        tqdm_bar = tqdm(total=self.num_data - self.num_examples, leave=False)
+        for start in range(self.num_examples, self.num_data, self.save_every):
+            tqdm_bar.update(min(self.save_every, self.num_data - start))
+            end = min(start + self.save_every, self.num_data)
+            try:
+                self.batch_prediction(start, end)
+            except Exception as e:
+                print(e)
+            self.data.to_excel(self.out_file_path, index=False)
+        self.data.to_excel(self.out_file_path, index=False)
         print(f"Results saved to {self.out_file_path}")

requirements.txt CHANGED Viewed

@@ -3,3 +3,4 @@ openai
 argparse
 openpyxl
 gradio

 argparse
 openpyxl
 gradio
+tenacity