suneeln-duke committed on
Commit
6c57304
1 Parent(s): cc84091
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
Dockerfile ADDED
@@ -0,0 +1,23 @@
+ FROM python:3.11.2
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN apt update && apt install -y ffmpeg
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+
+ ENV H2O_WAVE_LISTEN=":7860"
+ ENV H2O_WAVE_ADDRESS="http://127.0.0.1:7860"
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
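A quick smoke test once the image is running (a sketch; the tag and port mapping are assumptions, e.g. after docker build -t dune-cyoa . and docker run -p 7860:7860 dune-cyoa):

    import requests  # 'requests' is an assumption; it is not pinned in requirements.txt

    resp = requests.get("http://localhost:7860/hello")
    print(resp.status_code, resp.json())  # expect: 200 {'message': 'Hello World'}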
data/dune.pdf ADDED
Binary file (333 kB).
 
main.py ADDED
@@ -0,0 +1,54 @@
+ from fastapi import FastAPI
+
+ import os
+
+ import wandb
+
+ import huggingface_hub
+
+ from scripts.summarization import falcon_summ
+
+ from scripts.decision_clf import seq_clf
+
+ from scripts.path_gen import paths_gen
+
+ from scripts.text_gen import story_gen
+
+ app = FastAPI()
+
+ wandb.login(key=os.getenv('wandb_key'))
+
+ huggingface_hub.login(token=os.getenv('hf_key'))
+
+ os.environ['OPENAI_API_KEY'] = os.getenv('openapi_key')
+
+ summarizer = falcon_summ.prep_pipeline()
+
+ token_path = 'models/dec_clf/tokenizer.pkl'
+
+ model_path = 'models/dec_clf/nlp.h5'
+
+ chunks = paths_gen.get_chunks("data/dune.pdf")
+
+ db = paths_gen.get_vectordb(chunks)
+
+ @app.get("/hello")
+ def hello():
+     return {"message": "Hello World"}
+
+ @app.post("/summ")
+ def summ(text: str):
+     return {"summary": falcon_summ.gen_summary(summarizer, text)}
+
+ @app.post("/clf")
+ def clf(text: str):
+     return {"decision": seq_clf.predict(text, model_path, token_path)}
+
+ @app.post("/gen_path")
+ def gen_path(text: str):
+     return paths_gen.gen_sample(text, db)
+
+ @app.post("/gen_story")
+ def gen_story(text: str, decision: str):
+     return story_gen.gen_sample(text, decision, db)
+
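Note: because these handlers declare bare str parameters, FastAPI treats text and decision as required query parameters even on POST. A client-side sketch (host and port assume the Dockerfile above; requests is not in requirements.txt):

    import requests

    BASE = "http://localhost:7860"
    chunk = "Paul hesitated at the hatch of the ornithopter."

    print(requests.post(f"{BASE}/summ", params={"text": chunk}).json())
    print(requests.post(f"{BASE}/clf", params={"text": chunk}).json())
    print(requests.post(f"{BASE}/gen_story",
                        params={"text": chunk, "decision": "He climbs in."}).json())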
models/dec_clf/nlp.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36e39f6a5bd037d9180031701950af7caedd3c27e44d68d6f4ee2361fbdc41df
+ size 8117880
models/dec_clf/tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a432c203344b2bc0ac2c6fe4e0f50bf2003ea4b0b715f50a8a999a38907f11b8
+ size 307491
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ fastapi
+ uvicorn
+ pandas
+ chromadb==0.3.29
+ keras==2.13.1
+ tensorflow==2.13.0
+ torch==2.0.0
+ wandb
+ openai
+ langchain==0.1.16
+ langchain-community==0.0.34
+ langchain-core==0.1.45
+ langchain-text-splitters==0.0.1
+ tiktoken
+ pypdf  # required by langchain's PyPDFLoader
+ unstructured==0.13.3
+ transformers==4.36.1
+ scikit-learn==1.4.2
scripts/decision_clf/__pycache__/falcon_clf.cpython-311.pyc ADDED
Binary file (12.3 kB).
 
scripts/decision_clf/__pycache__/rag_clf.cpython-311.pyc ADDED
Binary file (4.78 kB).
 
scripts/decision_clf/__pycache__/seq_clf.cpython-311.pyc ADDED
Binary file (8.12 kB).
 
scripts/decision_clf/falcon_clf.py ADDED
@@ -0,0 +1,282 @@
+ import os
+
+ import numpy as np
+
+ import pandas as pd
+
+ import torch
+
+ from tqdm import tqdm
+
+ from datasets import Dataset
+
+ from peft import LoraConfig
+
+ from trl import SFTTrainer
+
+ from transformers import (
+     AutoTokenizer,
+     BitsAndBytesConfig,
+     FalconForCausalLM,
+     TrainingArguments,
+     pipeline,
+ )
+ from sklearn.metrics import (accuracy_score,
+                              classification_report,
+                              confusion_matrix)
+ from sklearn.model_selection import train_test_split
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+ def generate_prompt(data_point):
+     return f"""### Instruction:
+ Classify whether the given chunk involves a decision that will affect the story or not.
+ A decision is defined as when the character goes about making a choice between two or more options.
+ The decision should be significant enough to affect the story in a major way.
+ It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
+ This involves interactions between characters, or between the character and the environment.
+ What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.
+ Return the answer as the corresponding decision label "yes" or "no".
+
+ ### Text:
+ {data_point["text"]}
+
+ ### Decision:
+ {data_point["decision"]}
+ """
+
+ def generate_test_prompt(data_point):
+     return f"""### Instruction:
+ Classify whether the given chunk involves a decision that will affect the story or not.
+ A decision is defined as when the character goes about making a choice between two or more options.
+ The decision should be significant enough to affect the story in a major way.
+ It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
+ This involves interactions between characters, or between the character and the environment.
+ What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.
+ Return the answer as the corresponding decision label "yes" or "no".
+
+ ### Text:
+ {data_point["text"]}
+
+ ### Decision:
+ """
+
+ def predict(X_test, model, tokenizer):
+     y_pred = []
+     # build the pipeline once; rebuilding it per sample is wasteful
+     pipe = pipeline(task="text-generation",
+                     model=model,
+                     tokenizer=tokenizer,
+                     max_new_tokens=1,
+                     temperature=0.0,
+                     )
+     for i in tqdm(range(len(X_test))):
+         prompt = X_test.iloc[i]["text"]
+         result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
+         answer = result[0]['generated_text'].split("=")[-1].lower()
+         if "yes" in answer:
+             y_pred.append("yes")
+         elif "no" in answer:
+             y_pred.append("no")
+         else:
+             y_pred.append("none")
+     return y_pred
+
+ def evaluate(y_true, y_pred):
+     mapping = {"yes": 1, "no": 0, "none": 2}
+
+     def map_func(x):
+         return mapping.get(x, 1)
+
+     y_true = np.vectorize(map_func)(y_true)
+     y_pred = np.vectorize(map_func)(y_pred)
+
+     # Overall accuracy
+     accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
+     print(f'Accuracy: {accuracy:.3f}')
+
+     # Per-label accuracy
+     unique_labels = set(y_true)
+
+     for label in unique_labels:
+         label_indices = [i for i in range(len(y_true))
+                          if y_true[i] == label]
+         label_y_true = [y_true[i] for i in label_indices]
+         label_y_pred = [y_pred[i] for i in label_indices]
+         accuracy = accuracy_score(label_y_true, label_y_pred)
+         print(f'Accuracy for label {label}: {accuracy:.3f}')
+
+     # Classification report
+     class_report = classification_report(y_true=y_true, y_pred=y_pred)
+     print('\nClassification Report:')
+     print(class_report)
+
+     # Confusion matrix
+     conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
+     print('\nConfusion Matrix:')
+     print(conf_matrix)
+
+
+ def prep_data():
+     filename = '../../data/output/decisions.csv'
+
+     df = pd.read_csv(filename, encoding="utf-8", encoding_errors="replace")
+
+     df = df[['text', 'decision']]
+
+     X_train = list()
+     X_test = list()
+
+     for decision in ["yes", "no"]:
+         train, test = train_test_split(df[df.decision == decision],
+                                        train_size=.8,
+                                        test_size=.2,
+                                        random_state=42)
+         X_train.append(train)
+         X_test.append(test)
+
+     X_train = pd.concat(X_train).sample(frac=1, random_state=10)
+     X_test = pd.concat(X_test)
+
+     # note: `train` and `test` here are whatever the last loop iteration left behind
+     eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
+     X_eval = df[df.index.isin(eval_idx)]
+     X_eval = (X_eval
+               .groupby('decision', group_keys=False)
+               .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
+
+     X_train = X_train.reset_index(drop=True)
+     X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
+                            columns=["text"])
+     X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
+                           columns=["text"])
+
+     y_true = X_test.decision
+     X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])
+
+     train_data = Dataset.from_pandas(X_train)
+     eval_data = Dataset.from_pandas(X_eval)
+
+     return train_data, eval_data
+
+
+ def prep_model():
+     model_name = "Rocketknight1/falcon-rw-1b"
+
+     compute_dtype = getattr(torch, "float16")
+
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_use_double_quant=False,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=compute_dtype,
+     )
+
+     model = FalconForCausalLM.from_pretrained(
+         model_name,
+         device_map="auto",
+         quantization_config=bnb_config,
+     )
+
+     model.config.use_cache = False
+     model.config.pretraining_tp = 1
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                               trust_remote_code=True,
+                                               padding_side="left",
+                                               add_bos_token=True,
+                                               add_eos_token=True,
+                                               )
+
+     tokenizer.pad_token = tokenizer.eos_token
+
+     return model, tokenizer
+
+
+ def prep_trainer():
+     OUTPUT_DIR = "falcon-clf"
+
+     train_data, eval_data = prep_data()
+     model, tokenizer = prep_model()
+
+     peft_config = LoraConfig(
+         lora_alpha=16,
+         lora_dropout=0.1,
+         r=64,
+         bias="none",
+         task_type="CAUSAL_LM",
+     )
+
+     training_arguments = TrainingArguments(
+         output_dir=OUTPUT_DIR,
+         num_train_epochs=20,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=8,  # 4
+         optim="paged_adamw_32bit",
+         save_steps=0,
+         logging_steps=10,
+         learning_rate=2e-4,
+         weight_decay=0.001,
+         fp16=True,
+         bf16=False,
+         max_grad_norm=0.3,
+         max_steps=-1,
+         warmup_ratio=0.03,
+         group_by_length=True,
+         lr_scheduler_type="cosine",
+         report_to="tensorboard",
+         evaluation_strategy="epoch",
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         train_dataset=train_data,
+         eval_dataset=eval_data,
+         peft_config=peft_config,
+         dataset_text_field="text",
+         tokenizer=tokenizer,
+         args=training_arguments,
+         packing=False,
+         max_seq_length=1024,
+     )
+
+     return trainer
+
+
+ def train_model():
+     trainer = prep_trainer()
+
+     trainer.train()
+
+     trainer.model.save_pretrained("falcon-clf")
+
+     trainer.push_to_hub()
+
+
+ def get_classifier():
+     classifier = pipeline(model="suneeln-duke/falcon-clf", device_map="auto")
+
+     return classifier
+
+
+ def classify_dec(text, classifier):
+     text = generate_test_prompt({
+         'text': text
+     })
+
+     result = classifier(text, pad_token_id=classifier.tokenizer.eos_token_id)
+     answer = result[0]['generated_text'].split("=")[-1].lower()
+
+     if "yes" in answer:
+         return "yes"
+     elif "no" in answer:
+         return "no"
+     else:
+         # mirror predict(): fall back to "none" instead of returning None implicitly
+         return "none"
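A minimal inference sketch for this module (assumes the fine-tuned suneeln-duke/falcon-clf pipeline is available on the Hub):

    from scripts.decision_clf import falcon_clf

    classifier = falcon_clf.get_classifier()   # downloads suneeln-duke/falcon-clf
    label = falcon_clf.classify_dec("Paul chose to follow Jessica into the desert.", classifier)
    print(label)  # "yes", "no", or "none", decided by the single generated token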
scripts/decision_clf/rag_clf.py ADDED
@@ -0,0 +1,88 @@
+ import os
+
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores.chroma import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+
+ import PyPDF2
+
+ def read_pages(pdf_file):
+     pages = []
+
+     reader = PyPDF2.PdfReader(pdf_file)
+
+     for page_number in range(len(reader.pages)):
+         page = reader.pages[page_number]
+         page_content = page.extract_text()
+         pages.append(page_content)
+
+     return pages
+
+ def get_chunks(file_path):
+     loader = PyPDFLoader(file_path)
+
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=300,
+         chunk_overlap=100,
+         length_function=len,
+         add_start_index=True,
+     )
+
+     chunks = text_splitter.split_documents(documents)
+
+     return chunks
+
+ def get_vectordb(chunks, CHROMA_PATH):
+     CHROMA_PATH = f"../../data/chroma/{CHROMA_PATH}"
+
+     if os.path.exists(CHROMA_PATH):
+         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
+     else:
+         db = Chroma.from_documents(
+             chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
+         )
+         db.persist()
+         print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
+
+     return db
+
+ def classify_dec(text, db):
+     PROMPT_TEMPLATE = """
+     Answer the question based only on the following context:
+
+     {context}
+
+     ---
+
+     Answer the question based on the above context: {question}
+     """
+
+     query_text = f"""
+     Classify whether the given chunk involves a decision that will affect the story or not.
+
+     A decision is defined as when the character goes about making a choice between two or more options.
+     The decision should be significant enough to affect the story in a major way.
+     It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
+     This involves interactions between characters, or between the character and the environment.
+     What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.
+
+     Return the answer as the corresponding decision label "yes" or "no".
+
+     {text}
+     """
+
+     results = db.similarity_search_with_relevance_scores(query_text, k=5)
+
+     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+     prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+     prompt = prompt_template.format(context=context_text, question=query_text)
+
+     model = ChatOpenAI()
+     response_text = model.predict(prompt)
+
+     return response_text
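A sketch of how these pieces compose (assumes OPENAI_API_KEY is set and the relative chroma directory is writable; "dune" is an illustrative store name):

    from scripts.decision_clf import rag_clf

    chunks = rag_clf.get_chunks("data/dune.pdf")
    db = rag_clf.get_vectordb(chunks, "dune")      # persists under ../../data/chroma/dune
    print(rag_clf.classify_dec(chunks[0].page_content, db))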
scripts/decision_clf/seq_clf.py ADDED
@@ -0,0 +1,140 @@
+ import pickle
+
+ import numpy as np
+ import pandas as pd
+
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from tensorflow.keras.metrics import Precision, Recall
+ from tensorflow.keras.layers import Dense, ReLU
+ from tensorflow.keras.layers import Embedding, BatchNormalization, Concatenate
+ from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dropout
+ from tensorflow.keras.models import Sequential, Model, load_model
+ from keras.utils import to_categorical
+
+ from sklearn.preprocessing import LabelEncoder
+ from sklearn.model_selection import train_test_split
+
+ def prep_data():
+     # split proportions for train / test / validation
+     train_size = 0.9
+     test_size = 0.05
+     val_size = 0.05
+
+     df = pd.read_csv('../../data/output/decisions.csv')
+     df = df[['text', 'decision']]
+
+     # First split into train and (test + val)
+     df, test_val_df = train_test_split(df, test_size=(test_size + val_size), random_state=42)
+
+     # Then split test_val_df into test and validation sets
+     test_df, val_df = train_test_split(test_val_df, test_size=val_size/(test_size + val_size), random_state=42)
+
+     return df, test_df, val_df
+
+ def split_data():
+     df, test_df, val_df = prep_data()
+
+     X_train = df['text']
+     y_train = df['decision']
+
+     X_test = test_df['text']
+     y_test = test_df['decision']
+
+     X_val = val_df['text']
+     y_val = val_df['decision']
+
+     encoder = LabelEncoder()
+
+     y_train = encoder.fit_transform(y_train)
+     y_val = encoder.transform(y_val)
+     y_test = encoder.transform(y_test)
+
+     mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))
+
+     return X_train, y_train, X_test, y_test, X_val, y_val, mapping
+
+ def prep_model():
+     max_words = 10000
+     max_len = 50
+     embedding_dim = 32
+
+     # Branch 1
+     branch1 = Sequential()
+     branch1.add(Embedding(max_words, embedding_dim, input_length=max_len))
+     branch1.add(Conv1D(64, 3, padding='same', activation='relu'))
+     branch1.add(BatchNormalization())
+     branch1.add(ReLU())
+     branch1.add(Dropout(0.5))
+     branch1.add(GlobalMaxPooling1D())
+
+     # Branch 2
+     branch2 = Sequential()
+     branch2.add(Embedding(max_words, embedding_dim, input_length=max_len))
+     branch2.add(Conv1D(64, 3, padding='same', activation='relu'))
+     branch2.add(BatchNormalization())
+     branch2.add(ReLU())
+     branch2.add(Dropout(0.5))
+     branch2.add(GlobalMaxPooling1D())
+
+     concatenated = Concatenate()([branch1.output, branch2.output])
+
+     hid_layer = Dense(128, activation='relu')(concatenated)
+     dropout = Dropout(0.3)(hid_layer)
+     output_layer = Dense(2, activation='softmax')(dropout)
+
+     model = Model(inputs=[branch1.input, branch2.input], outputs=output_layer)
+
+     model.compile(optimizer='adamax',
+                   loss='binary_crossentropy',
+                   metrics=['accuracy', Precision(), Recall()])
+
+     return model
+
+ def train_model():
+     X_train, y_train, X_test, y_test, X_val, y_val, mapping = split_data()
+
+     tokenizer = Tokenizer(num_words=10000)
+     tokenizer.fit_on_texts(X_train)
+
+     sequences = tokenizer.texts_to_sequences(X_train)
+     tr_x = pad_sequences(sequences, maxlen=50)
+     tr_y = to_categorical(y_train)
+
+     sequences = tokenizer.texts_to_sequences(X_val)
+     val_x = pad_sequences(sequences, maxlen=50)
+     val_y = to_categorical(y_val)
+
+     sequences = tokenizer.texts_to_sequences(X_test)
+     ts_x = pad_sequences(sequences, maxlen=50)
+     ts_y = to_categorical(y_test)
+
+     model = prep_model()
+
+     batch_size = 256
+     epochs = 100
+     # both branches consume the same padded sequences, hence the doubled input
+     history = model.fit([tr_x, tr_x], tr_y, epochs=epochs, batch_size=batch_size,
+                         validation_data=([val_x, val_x], val_y))
+
+     with open('../../data/models/dec_clf/tokenizer.pkl', 'wb') as tokenizer_file:
+         pickle.dump(tokenizer, tokenizer_file)
+
+     model.save('../../data/models/dec_clf/nlp.h5')
+
+ def predict(text, model_path, token_path):
+     model = load_model(model_path)
+
+     with open(token_path, 'rb') as f:
+         tokenizer = pickle.load(f)
+
+     sequences = tokenizer.texts_to_sequences([text])
+     x_new = pad_sequences(sequences, maxlen=50)
+     predictions = model.predict([x_new, x_new])
+
+     mapping = {0: 'no', 1: 'yes'}
+
+     probs = list(predictions[0])
+     max_idx = np.argmax(probs)
+
+     return mapping[max_idx]
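This is the classifier main.py actually serves, with the artifacts committed under models/dec_clf above; standalone it looks like:

    from scripts.decision_clf import seq_clf

    label = seq_clf.predict("Paul chose to follow Jessica into the desert.",
                            "models/dec_clf/nlp.h5",
                            "models/dec_clf/tokenizer.pkl")
    print(label)  # "yes" or "no"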
scripts/path_gen/paths_gen.py ADDED
@@ -0,0 +1,79 @@
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores.chroma import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+
+
+ def get_chunks(file_path):
+     loader = PyPDFLoader(file_path)
+
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=300,
+         chunk_overlap=100,
+         length_function=len,
+         add_start_index=True,
+     )
+
+     chunks = text_splitter.split_documents(documents)
+
+     return chunks
+
+ def get_vectordb(chunks):
+     # in-memory store; the persistent variant is kept below for reference
+     db = Chroma.from_documents(chunks, OpenAIEmbeddings())
+
+     # CHROMA_PATH = f"../../chroma/{CHROMA_PATH}"
+     # if os.path.exists(CHROMA_PATH):
+     #     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
+     # else:
+     #     db = Chroma.from_documents(
+     #         chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
+     #     )
+     #     db.persist()
+     #     print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
+
+     return db
+
+
+ def gen_sample(text, db):
+     PROMPT_TEMPLATE = """
+     Answer the question based only on the following context:
+
+     {context}
+
+     ---
+
+     Answer the question based on the above context: {question}
+     """
+
+     query_text = f"""
+     Act as the author of a Choose Your Own Adventure Book. This book is special as it is based on existing material.
+     Now, as with any choose your own adventure book, you'll have to generate decision paths at certain points in the story.
+     Your job is to generate 4 decision paths for the given point in the story, if applicable to that point in the story.
+     If the given part of the story doesn't contain any decisions from which to generate decision paths, don't
+     generate any. If the given part of the story contains a decision, generate 4 decision paths for that decision.
+     One among the 4 decision paths should be the original path; the other 3 should deviate from the original path in a sensible manner.
+     The decision paths should be generated in a way that they are coherent with the existing story.
+     The result should be a JSON object with the following keys: [text, paths]
+
+     text: The given text
+     paths: The generated decision paths as strings in a list
+
+     ```{text}```
+     """
+
+     results = db.similarity_search_with_relevance_scores(query_text, k=5)
+
+     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+     prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+     prompt = prompt_template.format(context=context_text, question=query_text)
+
+     model = ChatOpenAI()
+     response_text = model.predict(prompt)
+
+     # NOTE: eval() trusts the model to emit a Python/JSON literal; json.loads would be stricter
+     return eval(response_text)
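main.py wires this up at startup; in isolation (OPENAI_API_KEY assumed, and the "paths" key only exists if the model honored the JSON contract in the prompt):

    from scripts.path_gen import paths_gen

    chunks = paths_gen.get_chunks("data/dune.pdf")
    db = paths_gen.get_vectordb(chunks)            # in-memory Chroma
    sample = paths_gen.gen_sample(chunks[0].page_content, db)
    print(sample["paths"])                         # up to 4 candidate decision paths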
scripts/summarization/__pycache__/falcon_summ.cpython-311.pyc ADDED
Binary file (866 Bytes).
 
scripts/summarization/__pycache__/rag_summ.cpython-311.pyc ADDED
Binary file (4.2 kB).
 
scripts/summarization/__pycache__/t5_summ.cpython-311.pyc ADDED
Binary file (5.36 kB).
 
scripts/summarization/falcon_summ.py ADDED
@@ -0,0 +1,12 @@
+
+ from transformers import pipeline
+
+ def prep_pipeline():
+     summarizer = pipeline("summarization", model="Falconsai/text_summarization")
+
+     return summarizer
+
+ def gen_summary(summarizer, text):
+     summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
+
+     return summary
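Usage mirrors what main.py does at startup (the input text here is a placeholder; anything comfortably longer than min_length=30 tokens works):

    from scripts.summarization import falcon_summ

    summarizer = falcon_summ.prep_pipeline()   # pulls Falconsai/text_summarization on first use
    text = "..."                               # any sufficiently long passage
    print(falcon_summ.gen_summary(summarizer, text))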
scripts/summarization/graph_summ.py ADDED
File without changes
scripts/summarization/rag_summ.py ADDED
@@ -0,0 +1,88 @@
+ import os
+
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores.chroma import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+
+ import PyPDF2
+
+ def read_pages(pdf_file):
+     pages = []
+
+     reader = PyPDF2.PdfReader(pdf_file)
+
+     for page_number in range(len(reader.pages)):
+         page = reader.pages[page_number]
+         page_content = page.extract_text()
+         pages.append(page_content)
+
+     return pages
+
+ def get_chunks(file_path):
+     loader = PyPDFLoader(file_path)
+
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=300,
+         chunk_overlap=100,
+         length_function=len,
+         add_start_index=True,
+     )
+
+     chunks = text_splitter.split_documents(documents)
+
+     return chunks
+
+ def get_vectordb(chunks, CHROMA_PATH):
+     CHROMA_PATH = f"../../data/chroma/{CHROMA_PATH}"
+
+     if os.path.exists(CHROMA_PATH):
+         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
+     else:
+         db = Chroma.from_documents(
+             chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
+         )
+         db.persist()
+         print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
+
+     return db
+
+ def gen_summary(text, db):
+     PROMPT_TEMPLATE = """
+     Answer the question based only on the following context:
+
+     {context}
+
+     ---
+
+     Answer the question based on the above context: {question}
+     """
+
+     query_text = f"""
+     Summarize the given chunk from a story. The summary should be of a narrative nature and be around 5-7 sentences long.
+
+     ```{text}```
+
+     Generate the response in the following JSON format:
+
+     {{
+         "summary": "Your summary here.",
+         "text": "The original text here."
+     }}
+     """
+
+     results = db.similarity_search_with_relevance_scores(query_text, k=5)
+
+     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+     prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+     prompt = prompt_template.format(context=context_text, question=query_text)
+
+     model = ChatOpenAI()
+     response_text = model.predict(prompt)
+
+     # NOTE: eval() trusts the model to emit a dict literal; json.loads would be stricter
+     return eval(response_text)
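As with rag_clf, the flow is chunks → vector store → query (OPENAI_API_KEY assumed; "dune-summ" is an illustrative store name, and the "summary" key depends on the model honoring the JSON format):

    from scripts.summarization import rag_summ

    chunks = rag_summ.get_chunks("data/dune.pdf")
    db = rag_summ.get_vectordb(chunks, "dune-summ")
    result = rag_summ.gen_summary(chunks[0].page_content, db)
    print(result["summary"])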
scripts/summarization/t5_summ.py ADDED
@@ -0,0 +1,103 @@
+ import numpy as np
+
+ import evaluate
+
+ from datasets import load_dataset
+
+ from transformers import AutoTokenizer
+ from transformers import DataCollatorForSeq2Seq
+ from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
+ from transformers import pipeline
+
+ checkpoint = "Falconsai/text_summarization"
+
+ output_dir = "falcon-summ"
+
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+ rouge = evaluate.load("rouge")
+
+ def compute_metrics(eval_pred):
+     predictions, labels = eval_pred
+     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+     result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+
+     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
+     result["gen_len"] = np.mean(prediction_lens)
+
+     return {k: round(v, 4) for k, v in result.items()}
+
+ def preprocess_function(examples, max_length=1024, max_target_length=128):
+     prefix = "summarize: "
+
+     inputs = [prefix + doc for doc in examples["text"]]
+
+     model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)
+
+     labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)
+
+     model_inputs["labels"] = labels["input_ids"]
+
+     return model_inputs
+
+ def prep_data():
+     billsum = load_dataset("billsum", split="ca_test")
+
+     billsum = billsum.train_test_split(test_size=0.2)
+
+     return billsum
+
+ def prep_model():
+     billsum = prep_data()
+
+     tokenized_billsum = billsum.map(preprocess_function, batched=True)
+
+     data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
+
+     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+
+     training_args = Seq2SeqTrainingArguments(
+         output_dir=output_dir,
+         evaluation_strategy="epoch",
+         learning_rate=2e-5,
+         per_device_train_batch_size=16,
+         per_device_eval_batch_size=16,
+         weight_decay=0.01,
+         save_total_limit=3,
+         num_train_epochs=30,
+         predict_with_generate=True,
+         fp16=True,
+         push_to_hub=True,
+     )
+
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=tokenized_billsum["train"],
+         eval_dataset=tokenized_billsum["test"],
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+         compute_metrics=compute_metrics,
+     )
+
+     return trainer
+
+ def train_model(trainer):
+     trainer.train()
+
+     trainer.save_model(output_dir)
+
+     trainer.push_to_hub()
+
+ def prep_pipeline():
+     summarizer = pipeline("summarization", model=f"suneeln-duke/{output_dir}")
+
+     return summarizer
+
+ def gen_summary(summarizer, text):
+     summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
+
+     return summary
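Training entry point, as the module defines it (push_to_hub=True means a Hub login is assumed before running):

    from scripts.summarization import t5_summ

    trainer = t5_summ.prep_model()   # tokenizes billsum and builds the Seq2SeqTrainer
    t5_summ.train_model(trainer)     # trains, saves to ./falcon-summ, pushes to the Hub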
scripts/text_gen/story_gen.py ADDED
@@ -0,0 +1,73 @@
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores.chroma import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+
+
+ def get_chunks(file_path):
+     loader = PyPDFLoader(file_path)
+
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=300,
+         chunk_overlap=100,
+         length_function=len,
+         add_start_index=True,
+     )
+
+     chunks = text_splitter.split_documents(documents)
+
+     return chunks
+
+ def get_vectordb(chunks):
+     # in-memory store; the persistent variant is kept below for reference
+     db = Chroma.from_documents(chunks, OpenAIEmbeddings())
+
+     # if os.path.exists(CHROMA_PATH):
+     #     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
+     # else:
+     #     db = Chroma.from_documents(
+     #         chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
+     #     )
+     #     db.persist()
+     #     print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
+
+     return db
+
+
+ def gen_sample(text, decision, db):
+     PROMPT_TEMPLATE = """
+     Answer the question based only on the following context:
+
+     {context}
+
+     ---
+
+     Answer the question based on the above context: {question}
+     """
+
+     query_text = f"""
+     Act as the author of a Choose Your Own Adventure Book. This book is special as it is based on existing material.
+     Now, as with any choose your own adventure book, there are infinite paths based on the choices a user makes.
+     Given some relevant text and the decision taken with respect to the relevant text, generate the next part of the story.
+     It should be within 6-8 sentences and read as if it were actually part of the story.
+
+     Relevant: {text}
+
+     Decision: {decision}
+     """
+
+     results = db.similarity_search_with_relevance_scores(query_text, k=5)
+
+     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+     prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+     prompt = prompt_template.format(context=context_text, question=query_text)
+
+     model = ChatOpenAI()
+     response_text = model.predict(prompt)
+
+     # the model returns plain prose here, so no eval() — return it as-is
+     return response_text
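End-to-end, matching the /gen_story endpoint (OPENAI_API_KEY assumed; the decision string is illustrative):

    from scripts.text_gen import story_gen

    chunks = story_gen.get_chunks("data/dune.pdf")
    db = story_gen.get_vectordb(chunks)
    continuation = story_gen.gen_sample(chunks[0].page_content,
                                        "Paul decides to trust the Fremen.", db)
    print(continuation)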