Spaces: Runtime error

suneeln-duke committed
Commit • 6c57304
1 Parent(s): cc84091
- .gitignore +1 -0
- Dockerfile +23 -0
- data/dune.pdf +0 -0
- main.py +56 -0
- models/dec_clf/nlp.h5 +3 -0
- models/dec_clf/tokenizer.pkl +3 -0
- requirements.txt +18 -0
- scripts/decision_clf/__pycache__/falcon_clf.cpython-311.pyc +0 -0
- scripts/decision_clf/__pycache__/rag_clf.cpython-311.pyc +0 -0
- scripts/decision_clf/__pycache__/seq_clf.cpython-311.pyc +0 -0
- scripts/decision_clf/falcon_clf.py +291 -0
- scripts/decision_clf/rag_clf.py +116 -0
- scripts/decision_clf/seq_clf.py +172 -0
- scripts/path_gen/paths_gen.py +94 -0
- scripts/summarization/__pycache__/falcon_summ.cpython-311.pyc +0 -0
- scripts/summarization/__pycache__/rag_summ.cpython-311.pyc +0 -0
- scripts/summarization/__pycache__/t5_summ.cpython-311.pyc +0 -0
- scripts/summarization/falcon_summ.py +12 -0
- scripts/summarization/graph_summ.py +0 -0
- scripts/summarization/rag_summ.py +103 -0
- scripts/summarization/t5_summ.py +109 -0
- scripts/text_gen/story_gen.py +87 -0
.gitignore
ADDED
@@ -0,0 +1 @@
.env
Dockerfile
ADDED
@@ -0,0 +1,23 @@
FROM python:3.11.2

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

RUN apt update && apt install -y ffmpeg

RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

COPY --chown=user . $HOME/app

ENV H2O_WAVE_LISTEN=":7860"
ENV H2O_WAVE_ADDRESS="http://127.0.0.1:7860"

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
data/dune.pdf
ADDED
Binary file (333 kB).
main.py
ADDED
@@ -0,0 +1,56 @@
from fastapi import FastAPI

import os

from langchain.document_loaders import DirectoryLoader

import wandb
import huggingface_hub

from scripts.summarization import falcon_summ
from scripts.decision_clf import seq_clf
from scripts.path_gen import paths_gen
from scripts.text_gen import story_gen

app = FastAPI()

wandb.login(key=os.getenv('wandb_key'))
huggingface_hub.login(token=os.getenv('hf_key'))

os.environ['OPENAI_API_KEY'] = os.getenv('openapi_key')

summarizer = falcon_summ.prep_pipeline()

token_path = 'models/dec_clf/tokenizer.pkl'
model_path = 'models/dec_clf/nlp.h5'

chunks = paths_gen.get_chunks("data/dune.pdf")
db = paths_gen.get_vectordb(chunks)


@app.get("/hello")
def hello():
    return {"message": "Hello World"}


@app.post("/summ")
def summ(text: str):
    return {"summary": falcon_summ.gen_summary(summarizer, text)}


@app.post("/clf")
def clf(text: str):
    return {"decision": seq_clf.predict(text, model_path, token_path)}


@app.post("/gen_path")
def gen_path(text: str):
    return paths_gen.gen_sample(text, db)


@app.post("/gen_story")
def gen_story(text: str, decision: str):
    return story_gen.gen_sample(text, decision, db)
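A minimal client sketch against this API, assuming the app is served locally on port 7860 as in the Dockerfile; the sample text is illustrative:

    import requests

    base = "http://localhost:7860"

    # Health check
    print(requests.get(f"{base}/hello").json())  # {"message": "Hello World"}

    # The POST endpoints declare plain `str` parameters, so FastAPI expects
    # them as query parameters rather than a JSON body.
    chunk = "Paul hesitated at the door, weighing whether to follow his mother inside."
    print(requests.post(f"{base}/summ", params={"text": chunk}).json())
    print(requests.post(f"{base}/clf", params={"text": chunk}).json())
    print(requests.post(f"{base}/gen_path", params={"text": chunk}).json())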
models/dec_clf/nlp.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:36e39f6a5bd037d9180031701950af7caedd3c27e44d68d6f4ee2361fbdc41df
size 8117880
models/dec_clf/tokenizer.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a432c203344b2bc0ac2c6fe4e0f50bf2003ea4b0b715f50a8a999a38907f11b8
size 307491
requirements.txt
ADDED
@@ -0,0 +1,18 @@
fastapi
uvicorn
pandas
chromadb==0.3.29
keras==2.13.1
tensorflow==2.13.0
torch==2.0.0
wandb
openai
langchain==0.1.16
langchain-community==0.0.34
langchain-core==0.1.45
langchain-text-splitters==0.0.1
tiktoken
unstructured==0.13.3
transformers==4.36.1
scikit-learn==1.4.2
scripts/decision_clf/__pycache__/falcon_clf.cpython-311.pyc
ADDED
Binary file (12.3 kB).

scripts/decision_clf/__pycache__/rag_clf.cpython-311.pyc
ADDED
Binary file (4.78 kB).

scripts/decision_clf/__pycache__/seq_clf.cpython-311.pyc
ADDED
Binary file (8.12 kB).
scripts/decision_clf/falcon_clf.py
ADDED
@@ -0,0 +1,291 @@
import os
import warnings

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from datasets import Dataset
from peft import LoraConfig
from trl import SFTTrainer
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    FalconForCausalLM,
    TrainingArguments,
    pipeline,
)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

warnings.filterwarnings("ignore")


def generate_prompt(data_point):
    return f"""### Instruction:
Classify whether the given chunk involves a decision that will affect the story or not.
A decision is defined as when the character goes about making a choice between two or more options.
The decision should be significant enough to affect the story in a major way.
It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
This involves interactions between characters, or the character and the environment.
What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.
Return the answer as the corresponding decision label "yes" or "no"

### Text:
{data_point["text"]}

### Decision:
{data_point["decision"]}
"""


def generate_test_prompt(data_point):
    return f"""### Instruction:
Classify whether the given chunk involves a decision that will affect the story or not.
A decision is defined as when the character goes about making a choice between two or more options.
The decision should be significant enough to affect the story in a major way.
It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
This involves interactions between characters, or the character and the environment.
What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.
Return the answer as the corresponding decision label "yes" or "no"

### Text:
{data_point["text"]}

### Decision:
"""


def predict(X_test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens=1,
                        temperature=0.0,
                        )
        result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
        answer = result[0]['generated_text'].split("=")[-1].lower()
        if "yes" in answer:
            y_pred.append("yes")
        elif "no" in answer:
            y_pred.append("no")
        else:
            y_pred.append("none")
    return y_pred


def evaluate(y_true, y_pred):
    labels = ['yes', 'no', 'none']
    mapping = {"yes": 1, "no": 0, 'none': 2}

    def map_func(x):
        return mapping.get(x, 1)

    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true))
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)


def prep_data():
    filename = '../../data/output/decisions.csv'

    df = pd.read_csv(filename, encoding="utf-8", encoding_errors="replace")
    df = df[['text', 'decision']]

    X_train = list()
    X_test = list()

    for decision in ["yes", "no"]:
        train, test = train_test_split(df[df.decision == decision],
                                       train_size=.8,
                                       test_size=.2,
                                       random_state=42)
        X_train.append(train)
        X_test.append(test)

    X_train = pd.concat(X_train).sample(frac=1, random_state=10)
    X_test = pd.concat(X_test)

    eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
    X_eval = df[df.index.isin(eval_idx)]
    X_eval = (X_eval
              .groupby('decision', group_keys=False)
              .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))

    X_train = X_train.reset_index(drop=True)
    X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
                           columns=["text"])
    X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
                          columns=["text"])

    y_true = X_test.decision
    X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

    train_data = Dataset.from_pandas(X_train)
    eval_data = Dataset.from_pandas(X_eval)

    return train_data, eval_data


def prep_model():
    model_name = "Rocketknight1/falcon-rw-1b"

    compute_dtype = getattr(torch, "float16")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
    )

    model = FalconForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
    )

    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True,
                                              padding_side="left",
                                              add_bos_token=True,
                                              add_eos_token=True,
                                              )
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


def prep_trainer():
    OUTPUT_DIR = "falcon-clf"

    train_data, eval_data = prep_data()
    model, tokenizer = prep_model()

    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
    )

    training_arguments = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=20,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,  # 4
        optim="paged_adamw_32bit",
        save_steps=0,
        logging_steps=10,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=True,
        bf16=False,
        max_grad_norm=0.3,
        max_steps=-1,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="cosine",
        report_to="tensorboard",
        evaluation_strategy="epoch"
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=eval_data,
        peft_config=peft_config,
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
        max_seq_length=1024,
    )

    return trainer


def train_model():
    trainer = prep_trainer()
    trainer.train()
    trainer.model.save_pretrained("falcon-clf")
    trainer.push_to_hub()


def get_classifier():
    classifier = pipeline(model="suneeln-duke/falcon-clf", device_map="auto")
    return classifier


def classify_dec(text, classifier):
    text = generate_test_prompt({
        'text': text
    })

    result = classifier(text, pad_token_id=classifier.tokenizer.eos_token_id)
    answer = result[0]['generated_text'].split("=")[-1].lower()

    if "yes" in answer:
        return "yes"
    elif "no" in answer:
        return "no"
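A minimal inference sketch with the helpers above, assuming the fine-tuned suneeln-duke/falcon-clf checkpoint is available on the Hub; the sample chunk is illustrative:

    from scripts.decision_clf import falcon_clf

    classifier = falcon_clf.get_classifier()  # loads the fine-tuned text-generation pipeline
    chunk = "He drew the knife and stepped between the Baron and the door."
    # Returns "yes" or "no"; falls through to None if neither token appears.
    print(falcon_clf.classify_dec(chunk, classifier))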
scripts/decision_clf/rag_clf.py
ADDED
@@ -0,0 +1,116 @@
import os
import json
import re
import shutil

import openai
import PyPDF2

import langchain
import langchain.document_loaders

from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# read from config.ini file


def read_pages(pdf_file):
    pages = []

    reader = PyPDF2.PdfReader(pdf_file)

    for page_number in range(len(reader.pages)):
        page = reader.pages[page_number]
        page_content = page.extract_text()
        pages.append(page_content)

    return pages


def get_chunks(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)
    return chunks


def get_vectordb(chunks, CHROMA_PATH):
    CHROMA_PATH = f"../../data/chroma/{CHROMA_PATH}"

    if os.path.exists(CHROMA_PATH):
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
    else:
        db = Chroma.from_documents(
            chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
        )
        db.persist()
        print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

    return db


def classify_dec(text, db):
    PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

    query_text = f"""
Classify whether the given chunk involves a decision that will affect the story or not.

A decision is defined as when the character goes about making a choice between two or more options.
The decision should be significant enough to affect the story in a major way.
It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
This involves interactions between characters, or the character and the environment.
What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.

Return the answer as the corresponding decision label "yes" or "no"

{text}
"""

    results = db.similarity_search_with_relevance_scores(query_text, k=5)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = ChatOpenAI()
    response_text = model.predict(prompt)

    return response_text
scripts/decision_clf/seq_clf.py
ADDED
@@ -0,0 +1,172 @@
import pickle

import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.layers import Dense, ReLU
from tensorflow.keras.layers import Embedding, BatchNormalization, Concatenate
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.models import Sequential, Model, load_model
from keras.utils import to_categorical

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


def prep_data():
    # Assuming df is your DataFrame and you want to split based on 'col' column
    # You can adjust the test_size and val_size to change the split proportions
    train_size = 0.9
    test_size = 0.05
    val_size = 0.05

    df = pd.read_csv('../../data/output/decisions.csv')
    df = df[['text', 'decision']]

    # First split into train and (test + val)
    df, test_val_df = train_test_split(df, test_size=(test_size + val_size), random_state=42)

    # Then split test_val_df into test and validation sets
    test_df, val_df = train_test_split(test_val_df, test_size=val_size / (test_size + val_size), random_state=42)

    return df, test_df, val_df


def split_data():
    df, test_df, val_df = prep_data()

    X_train = df['text']
    y_train = df['decision']

    X_test = test_df['text']
    y_test = test_df['decision']

    X_val = val_df['text']
    y_val = val_df['decision']

    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.transform(y_val)
    y_test = encoder.transform(y_test)

    mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))

    return X_train, y_train, X_test, y_test, X_val, y_val, mapping


def prep_model():
    max_words = 10000
    max_len = 50
    embedding_dim = 32

    # Branch 1
    branch1 = Sequential()
    branch1.add(Embedding(max_words, embedding_dim, input_length=max_len))
    branch1.add(Conv1D(64, 3, padding='same', activation='relu'))
    branch1.add(BatchNormalization())
    branch1.add(ReLU())
    branch1.add(Dropout(0.5))
    branch1.add(GlobalMaxPooling1D())

    # Branch 2
    branch2 = Sequential()
    branch2.add(Embedding(max_words, embedding_dim, input_length=max_len))
    branch2.add(Conv1D(64, 3, padding='same', activation='relu'))
    branch2.add(BatchNormalization())
    branch2.add(ReLU())
    branch2.add(Dropout(0.5))
    branch2.add(GlobalMaxPooling1D())

    concatenated = Concatenate()([branch1.output, branch2.output])

    hid_layer = Dense(128, activation='relu')(concatenated)
    dropout = Dropout(0.3)(hid_layer)
    output_layer = Dense(2, activation='softmax')(dropout)

    model = Model(inputs=[branch1.input, branch2.input], outputs=output_layer)

    model.compile(optimizer='adamax',
                  loss='binary_crossentropy',
                  metrics=['accuracy', Precision(), Recall()])

    return model


def train_model():
    X_train, y_train, X_test, y_test, X_val, y_val, mapping = split_data()

    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(X_train)

    sequences = tokenizer.texts_to_sequences(X_train)
    tr_x = pad_sequences(sequences, maxlen=50)
    tr_y = to_categorical(y_train)

    sequences = tokenizer.texts_to_sequences(X_val)
    val_x = pad_sequences(sequences, maxlen=50)
    val_y = to_categorical(y_val)

    sequences = tokenizer.texts_to_sequences(X_test)
    ts_x = pad_sequences(sequences, maxlen=50)
    ts_y = to_categorical(y_test)

    model = prep_model()

    batch_size = 256
    epochs = 100
    history = model.fit([tr_x, tr_x], tr_y, epochs=epochs, batch_size=batch_size,
                        validation_data=([val_x, val_x], val_y))

    with open('../../data/models/dec_clf/tokenizer.pkl', 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

    model.save('../../data/models/dec_clf/nlp.h5')


def predict(text, model_path, token_path):
    model = load_model(model_path)

    with open(token_path, 'rb') as f:
        tokenizer = pickle.load(f)

    sequences = tokenizer.texts_to_sequences([text])
    x_new = pad_sequences(sequences, maxlen=50)
    predictions = model.predict([x_new, x_new])

    mapping = {0: 'no', 1: 'yes'}

    probs = list(predictions[0])
    max_idx = np.argmax(probs)

    return mapping[max_idx]
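A standalone sketch of the same call the /clf endpoint makes, using the artifacts committed under models/dec_clf/; the sample text is illustrative:

    from scripts.decision_clf import seq_clf

    model_path = 'models/dec_clf/nlp.h5'          # same paths main.py uses
    token_path = 'models/dec_clf/tokenizer.pkl'

    text = "Jessica chose to trust the Fremen and handed over the knife."
    print(seq_clf.predict(text, model_path, token_path))  # -> 'yes' or 'no'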
scripts/path_gen/paths_gen.py
ADDED
@@ -0,0 +1,94 @@
import langchain.document_loaders

from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import os
import shutil


def get_chunks(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)
    return chunks


def get_vectordb(chunks):
    # Build an in-memory Chroma store from the chunks; `from_documents`
    # takes the embedding as its second positional argument.
    db = Chroma.from_documents(chunks, OpenAIEmbeddings())

    # Persistent variant, kept for reference:
    # CHROMA_PATH = f"../../chroma/{CHROMA_PATH}"
    # if os.path.exists(CHROMA_PATH):
    #     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
    # else:
    #     db = Chroma.from_documents(
    #         chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    #     )
    #     db.persist()
    #     print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

    return db


def gen_sample(text, db):
    PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

    query_text = f"""
Act as the author of a Choose Your Own Adventure Book. This book is special as it is based on existing material.
Now, as with any choose your own adventure book, you'll have to generate decision paths at certain points in the story.
Your job is to generate 4 decision paths for the given point in the story, if applicable to that point in the story.
If the given part of the story doesn't contain any decisions from which to generate decision paths, don't
generate any. If the given part of the story contains a decision, generate 4 decision paths for that decision.
One among the 4 decision paths should be the original path, the other 3 should deviate from the original path in a sensible manner.
The decision paths should be generated in a way that they are coherent with the existing story.
The result should be a JSON object with the following keys: [text, paths]

text: The given text
paths: The generated decision paths as strings in a list

```{text}```
"""

    results = db.similarity_search_with_relevance_scores(query_text, k=5)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = ChatOpenAI()
    response_text = model.predict(prompt)

    return eval(response_text)
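An end-to-end sketch for this module, assuming OPENAI_API_KEY is set (main.py sets it from the `openapi_key` secret); the sample chunk is illustrative. Note that `gen_sample` eval()s the model reply, so it only succeeds when the model honors the `{text, paths}` contract in the prompt:

    from scripts.path_gen import paths_gen

    chunks = paths_gen.get_chunks("data/dune.pdf")
    db = paths_gen.get_vectordb(chunks)

    chunk = "Paul stood at the threshold of the sietch, the Fremen waiting on his word."
    result = paths_gen.gen_sample(chunk, db)  # dict with keys "text" and "paths"
    for path in result["paths"]:
        print("-", path)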
scripts/summarization/__pycache__/falcon_summ.cpython-311.pyc
ADDED
Binary file (866 Bytes).

scripts/summarization/__pycache__/rag_summ.cpython-311.pyc
ADDED
Binary file (4.2 kB).

scripts/summarization/__pycache__/t5_summ.cpython-311.pyc
ADDED
Binary file (5.36 kB).
scripts/summarization/falcon_summ.py
ADDED
@@ -0,0 +1,12 @@
from transformers import pipeline


def prep_pipeline():
    summarizer = pipeline("summarization", model="Falconsai/text_summarization")
    return summarizer


def gen_summary(summarizer, text):
    summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
    return summary
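A usage sketch mirroring what main.py does at startup; the sample passage is illustrative (longer passages suit the min_length=30 setting better):

    from scripts.summarization import falcon_summ

    summarizer = falcon_summ.prep_pipeline()  # Falconsai/text_summarization
    passage = ("Arrakis teaches the attitude of the knife: chopping off what's incomplete "
               "and saying, now it's complete because it's ended here.")
    print(falcon_summ.gen_summary(summarizer, passage))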
scripts/summarization/graph_summ.py
ADDED
File without changes
scripts/summarization/rag_summ.py
ADDED
@@ -0,0 +1,103 @@
import os

import PyPDF2

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate


def read_pages(pdf_file):
    pages = []

    reader = PyPDF2.PdfReader(pdf_file)

    for page_number in range(len(reader.pages)):
        page = reader.pages[page_number]
        page_content = page.extract_text()
        pages.append(page_content)

    return pages


def get_chunks(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)
    return chunks


def get_vectordb(chunks, CHROMA_PATH):
    CHROMA_PATH = f"../../data/chroma/{CHROMA_PATH}"

    if os.path.exists(CHROMA_PATH):
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
    else:
        db = Chroma.from_documents(
            chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
        )
        db.persist()
        print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

    return db


def gen_summary(text, db):
    PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

    query_text = f"""
Summarize the given chunk from a story. The summary should be of narrative nature and be around 5-7 sentences long.

```{text}```

Generate response in the following JSON format:

{{
    "summary": "Your summary here.",
    "text": "The original text here."
}}
"""

    results = db.similarity_search_with_relevance_scores(query_text, k=5)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = ChatOpenAI()
    response_text = model.predict(prompt)

    return eval(response_text)
scripts/summarization/t5_summ.py
ADDED
@@ -0,0 +1,109 @@
import numpy as np

import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline

checkpoint = "Falconsai/text_summarization"
output_dir = "falcon-summ"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


def preprocess_function(examples, max_length=1024, max_target_length=128):
    prefix = "summarize: "

    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


def prep_data():
    billsum = load_dataset("billsum", split="ca_test")
    billsum = billsum.train_test_split(test_size=0.2)
    return billsum


def prep_model():
    billsum = prep_data()

    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=30,
        predict_with_generate=True,
        fp16=True,
        push_to_hub=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_billsum["train"],
        eval_dataset=tokenized_billsum["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    return trainer


def train_model(trainer):
    trainer.train()
    trainer.save_model(output_dir)
    trainer.push_to_hub()


def prep_pipeline():
    summarizer = pipeline("summarization", model=f"suneeln-duke/{output_dir}")
    return summarizer


def gen_summary(summarizer, text):
    summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
    return summary
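The intended training flow for this module appears to be the sketch below; pushing to the Hub assumes a logged-in Hugging Face session:

    from scripts.summarization import t5_summ

    trainer = t5_summ.prep_model()  # tokenizes billsum and builds the Seq2SeqTrainer
    t5_summ.train_model(trainer)    # trains, saves to "falcon-summ", pushes to the Hub

    summarizer = t5_summ.prep_pipeline()  # reloads the pushed suneeln-duke/falcon-summ model
    print(t5_summ.gen_summary(summarizer, "Some long passage to summarize..."))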
scripts/text_gen/story_gen.py
ADDED
@@ -0,0 +1,87 @@
import langchain.document_loaders

from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
import os
import shutil


def get_chunks(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)
    return chunks


def get_vectordb(chunks):
    # Build an in-memory Chroma store from the chunks.
    db = Chroma.from_documents(chunks, OpenAIEmbeddings())

    # Persistent variant, kept for reference:
    # if os.path.exists(CHROMA_PATH):
    #     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
    # else:
    #     db = Chroma.from_documents(
    #         chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    #     )
    #     db.persist()
    #     print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

    return db


def gen_sample(text, decision, db):
    PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

    query_text = f"""
Act as the author of a Choose Your Own Adventure Book. This book is special as it is based on existing material.
Now, as with any choose your own adventure book, there are infinite paths based on the choices a user makes.
Given some relevant text and the decision taken with respect to the relevant text, generate the next part of the story.
It should be within 6-8 sentences and be coherent, as if it were actually part of the story.

Relevant: {text}

Decision: {decision}
"""

    results = db.similarity_search_with_relevance_scores(query_text, k=5)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = ChatOpenAI()
    response_text = model.predict(prompt)

    # The model is asked for plain story text here, so return the reply as-is.
    return response_text
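A usage sketch tying the pieces together, as the /gen_story endpoint does; the chunk and decision strings are illustrative:

    from scripts.path_gen import paths_gen
    from scripts.text_gen import story_gen

    chunks = paths_gen.get_chunks("data/dune.pdf")
    db = paths_gen.get_vectordb(chunks)

    chunk = "Paul stood at the threshold of the sietch, the Fremen waiting on his word."
    decision = "Paul refuses the water bond and turns back into the desert."
    print(story_gen.gen_sample(chunk, decision, db))  # 6-8 sentence continuation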