suneeln-duke committed on
Commit
6c57304
1 Parent(s): cc84091
.gitignore ADDED
@@ -0,0 +1 @@
+ .env
Dockerfile ADDED
@@ -0,0 +1,23 @@
+ FROM python:3.11.2
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN apt update && apt install -y ffmpeg
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+
+ ENV H2O_WAVE_LISTEN=":7860"
+ ENV H2O_WAVE_ADDRESS="http://127.0.0.1:7860"
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
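A quick smoke test once the image is running (a sketch; the tag and port mapping are assumptions, e.g. after docker build -t dune-cyoa . and docker run -p 7860:7860 dune-cyoa):

    import requests  # 'requests' is an assumption; it is not pinned in requirements.txt

    resp = requests.get("http://localhost:7860/hello")
    print(resp.status_code, resp.json())  # expect: 200 {'message': 'Hello World'}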
data/dune.pdf ADDED
Binary file (333 kB).
 
main.py ADDED
@@ -0,0 +1,54 @@
+ from fastapi import FastAPI
+
+ import os
+
+ import wandb
+
+ import huggingface_hub
+
+ from scripts.summarization import falcon_summ
+
+ from scripts.decision_clf import seq_clf
+
+ from scripts.path_gen import paths_gen
+
+ from scripts.text_gen import story_gen
+
+ app = FastAPI()
+
+ wandb.login(key=os.getenv('wandb_key'))
+
+ huggingface_hub.login(token=os.getenv('hf_key'))
+
+ os.environ['OPENAI_API_KEY'] = os.getenv('openapi_key')
+
+ summarizer = falcon_summ.prep_pipeline()
+
+ token_path = 'models/dec_clf/tokenizer.pkl'
+
+ model_path = 'models/dec_clf/nlp.h5'
+
+ chunks = paths_gen.get_chunks("data/dune.pdf")
+
+ db = paths_gen.get_vectordb(chunks)
+
+ @app.get("/hello")
+ def hello():
+     return {"message": "Hello World"}
+
+ @app.post("/summ")
+ def summ(text: str):
+     return {"summary": falcon_summ.gen_summary(summarizer, text)}
+
+ @app.post("/clf")
+ def clf(text: str):
+     return {"decision": seq_clf.predict(text, model_path, token_path)}
+
+ @app.post("/gen_path")
+ def gen_path(text: str):
+     return paths_gen.gen_sample(text, db)
+
+ @app.post("/gen_story")
+ def gen_story(text: str, decision: str):
+     return story_gen.gen_sample(text, decision, db)
+
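Note: because these handlers declare bare str parameters, FastAPI treats text and decision as required query parameters even on POST. A client-side sketch (host and port assume the Dockerfile above; requests is not in requirements.txt):

    import requests

    BASE = "http://localhost:7860"
    chunk = "Paul hesitated at the hatch of the ornithopter."

    print(requests.post(f"{BASE}/summ", params={"text": chunk}).json())
    print(requests.post(f"{BASE}/clf", params={"text": chunk}).json())
    print(requests.post(f"{BASE}/gen_story",
                        params={"text": chunk, "decision": "He climbs in."}).json())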
models/dec_clf/nlp.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36e39f6a5bd037d9180031701950af7caedd3c27e44d68d6f4ee2361fbdc41df
+ size 8117880
models/dec_clf/tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a432c203344b2bc0ac2c6fe4e0f50bf2003ea4b0b715f50a8a999a38907f11b8
+ size 307491
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ fastapi
+ uvicorn
+ pandas
+ chromadb==0.3.29
+ keras==2.13.1
+ tensorflow==2.13.0
+ torch==2.0.0
+ wandb
+ openai
+ langchain==0.1.16
+ langchain-community==0.0.34
+ langchain-core==0.1.45
+ langchain-text-splitters==0.0.1
+ tiktoken
+ pypdf  # required by langchain's PyPDFLoader
+ unstructured==0.13.3
+ transformers==4.36.1
+ scikit-learn==1.4.2
scripts/decision_clf/__pycache__/falcon_clf.cpython-311.pyc ADDED
Binary file (12.3 kB).
 
scripts/decision_clf/__pycache__/rag_clf.cpython-311.pyc ADDED
Binary file (4.78 kB).
 
scripts/decision_clf/__pycache__/seq_clf.cpython-311.pyc ADDED
Binary file (8.12 kB).
 
scripts/decision_clf/falcon_clf.py ADDED
@@ -0,0 +1,282 @@
+ import os
+
+ import numpy as np
+
+ import pandas as pd
+
+ import torch
+
+ from tqdm import tqdm
+
+ from datasets import Dataset
+
+ from peft import LoraConfig
+
+ from trl import SFTTrainer
+
+ from transformers import (
+     AutoTokenizer,
+     BitsAndBytesConfig,
+     FalconForCausalLM,
+     TrainingArguments,
+     pipeline,
+ )
+ from sklearn.metrics import (accuracy_score,
+                              classification_report,
+                              confusion_matrix)
+ from sklearn.model_selection import train_test_split
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ import warnings
+
+ warnings.filterwarnings("ignore")
+
+ def generate_prompt(data_point):
+     return f"""### Instruction:
+ Classify whether the given chunk involves a decision that will affect the story or not.
+ A decision is defined as when the character goes about making a choice between two or more options.
+ The decision should be significant enough to affect the story in a major way.
+ It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
+ This involves interactions between characters, or between the character and the environment.
+ What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.
+ Return the answer as the corresponding decision label "yes" or "no".
+
+ ### Text:
+ {data_point["text"]}
+
+ ### Decision:
+ {data_point["decision"]}
+ """
+
+ def generate_test_prompt(data_point):
+     return f"""### Instruction:
+ Classify whether the given chunk involves a decision that will affect the story or not.
+ A decision is defined as when the character goes about making a choice between two or more options.
+ The decision should be significant enough to affect the story in a major way.
+ It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
+ This involves interactions between characters, or between the character and the environment.
+ What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.
+ Return the answer as the corresponding decision label "yes" or "no".
+
+ ### Text:
+ {data_point["text"]}
+
+ ### Decision:
+ """
+
+ def predict(X_test, model, tokenizer):
+     y_pred = []
+     # build the pipeline once; rebuilding it per sample is wasteful
+     pipe = pipeline(task="text-generation",
+                     model=model,
+                     tokenizer=tokenizer,
+                     max_new_tokens=1,
+                     temperature=0.0,
+                     )
+     for i in tqdm(range(len(X_test))):
+         prompt = X_test.iloc[i]["text"]
+         result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
+         answer = result[0]['generated_text'].split("=")[-1].lower()
+         if "yes" in answer:
+             y_pred.append("yes")
+         elif "no" in answer:
+             y_pred.append("no")
+         else:
+             y_pred.append("none")
+     return y_pred
+
+ def evaluate(y_true, y_pred):
+     mapping = {"yes": 1, "no": 0, "none": 2}
+
+     def map_func(x):
+         return mapping.get(x, 1)
+
+     y_true = np.vectorize(map_func)(y_true)
+     y_pred = np.vectorize(map_func)(y_pred)
+
+     # Overall accuracy
+     accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
+     print(f'Accuracy: {accuracy:.3f}')
+
+     # Per-label accuracy
+     unique_labels = set(y_true)
+
+     for label in unique_labels:
+         label_indices = [i for i in range(len(y_true))
+                          if y_true[i] == label]
+         label_y_true = [y_true[i] for i in label_indices]
+         label_y_pred = [y_pred[i] for i in label_indices]
+         accuracy = accuracy_score(label_y_true, label_y_pred)
+         print(f'Accuracy for label {label}: {accuracy:.3f}')
+
+     # Classification report
+     class_report = classification_report(y_true=y_true, y_pred=y_pred)
+     print('\nClassification Report:')
+     print(class_report)
+
+     # Confusion matrix
+     conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
+     print('\nConfusion Matrix:')
+     print(conf_matrix)
+
+
+ def prep_data():
+     filename = '../../data/output/decisions.csv'
+
+     df = pd.read_csv(filename, encoding="utf-8", encoding_errors="replace")
+
+     df = df[['text', 'decision']]
+
+     X_train = list()
+     X_test = list()
+
+     for decision in ["yes", "no"]:
+         train, test = train_test_split(df[df.decision == decision],
+                                        train_size=.8,
+                                        test_size=.2,
+                                        random_state=42)
+         X_train.append(train)
+         X_test.append(test)
+
+     X_train = pd.concat(X_train).sample(frac=1, random_state=10)
+     X_test = pd.concat(X_test)
+
+     # note: `train` and `test` here are whatever the last loop iteration left behind
+     eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
+     X_eval = df[df.index.isin(eval_idx)]
+     X_eval = (X_eval
+               .groupby('decision', group_keys=False)
+               .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
+
+     X_train = X_train.reset_index(drop=True)
+     X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
+                            columns=["text"])
+     X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
+                           columns=["text"])
+
+     y_true = X_test.decision
+     X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])
+
+     train_data = Dataset.from_pandas(X_train)
+     eval_data = Dataset.from_pandas(X_eval)
+
+     return train_data, eval_data
+
+
+ def prep_model():
+     model_name = "Rocketknight1/falcon-rw-1b"
+
+     compute_dtype = getattr(torch, "float16")
+
+     bnb_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_use_double_quant=False,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=compute_dtype,
+     )
+
+     model = FalconForCausalLM.from_pretrained(
+         model_name,
+         device_map="auto",
+         quantization_config=bnb_config,
+     )
+
+     model.config.use_cache = False
+     model.config.pretraining_tp = 1
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                               trust_remote_code=True,
+                                               padding_side="left",
+                                               add_bos_token=True,
+                                               add_eos_token=True,
+                                               )
+
+     tokenizer.pad_token = tokenizer.eos_token
+
+     return model, tokenizer
+
+
+ def prep_trainer():
+     OUTPUT_DIR = "falcon-clf"
+
+     train_data, eval_data = prep_data()
+     model, tokenizer = prep_model()
+
+     peft_config = LoraConfig(
+         lora_alpha=16,
+         lora_dropout=0.1,
+         r=64,
+         bias="none",
+         task_type="CAUSAL_LM",
+     )
+
+     training_arguments = TrainingArguments(
+         output_dir=OUTPUT_DIR,
+         num_train_epochs=20,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=8,  # 4
+         optim="paged_adamw_32bit",
+         save_steps=0,
+         logging_steps=10,
+         learning_rate=2e-4,
+         weight_decay=0.001,
+         fp16=True,
+         bf16=False,
+         max_grad_norm=0.3,
+         max_steps=-1,
+         warmup_ratio=0.03,
+         group_by_length=True,
+         lr_scheduler_type="cosine",
+         report_to="tensorboard",
+         evaluation_strategy="epoch",
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         train_dataset=train_data,
+         eval_dataset=eval_data,
+         peft_config=peft_config,
+         dataset_text_field="text",
+         tokenizer=tokenizer,
+         args=training_arguments,
+         packing=False,
+         max_seq_length=1024,
+     )
+
+     return trainer
+
+
+ def train_model():
+     trainer = prep_trainer()
+
+     trainer.train()
+
+     trainer.model.save_pretrained("falcon-clf")
+
+     trainer.push_to_hub()
+
+
+ def get_classifier():
+     classifier = pipeline(model="suneeln-duke/falcon-clf", device_map="auto")
+
+     return classifier
+
+
+ def classify_dec(text, classifier):
+     text = generate_test_prompt({
+         'text': text
+     })
+
+     result = classifier(text, pad_token_id=classifier.tokenizer.eos_token_id)
+     answer = result[0]['generated_text'].split("=")[-1].lower()
+
+     if "yes" in answer:
+         return "yes"
+     elif "no" in answer:
+         return "no"
+     else:
+         # mirror predict(): fall back to "none" instead of returning None implicitly
+         return "none"
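A minimal inference sketch for this module (assumes the fine-tuned suneeln-duke/falcon-clf pipeline is available on the Hub):

    from scripts.decision_clf import falcon_clf

    classifier = falcon_clf.get_classifier()   # downloads suneeln-duke/falcon-clf
    label = falcon_clf.classify_dec("Paul chose to follow Jessica into the desert.", classifier)
    print(label)  # "yes", "no", or "none", decided by the single generated token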
scripts/decision_clf/rag_clf.py ADDED
@@ -0,0 +1,88 @@
+ import os
+
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores.chroma import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+
+ import PyPDF2
+
+ def read_pages(pdf_file):
+     pages = []
+
+     reader = PyPDF2.PdfReader(pdf_file)
+
+     for page_number in range(len(reader.pages)):
+         page = reader.pages[page_number]
+         page_content = page.extract_text()
+         pages.append(page_content)
+
+     return pages
+
+ def get_chunks(file_path):
+     loader = PyPDFLoader(file_path)
+
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=300,
+         chunk_overlap=100,
+         length_function=len,
+         add_start_index=True,
+     )
+
+     chunks = text_splitter.split_documents(documents)
+
+     return chunks
+
+ def get_vectordb(chunks, CHROMA_PATH):
+     CHROMA_PATH = f"../../data/chroma/{CHROMA_PATH}"
+
+     if os.path.exists(CHROMA_PATH):
+         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
+     else:
+         db = Chroma.from_documents(
+             chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
+         )
+         db.persist()
+         print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
+
+     return db
+
+ def classify_dec(text, db):
+     PROMPT_TEMPLATE = """
+     Answer the question based only on the following context:
+
+     {context}
+
+     ---
+
+     Answer the question based on the above context: {question}
+     """
+
+     query_text = f"""
+     Classify whether the given chunk involves a decision that will affect the story or not.
+
+     A decision is defined as when the character goes about making a choice between two or more options.
+     The decision should be significant enough to affect the story in a major way.
+     It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
+     This involves interactions between characters, or between the character and the environment.
+     What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.
+
+     Return the answer as the corresponding decision label "yes" or "no".
+
+     {text}
+     """
+
+     results = db.similarity_search_with_relevance_scores(query_text, k=5)
+
+     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+     prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+     prompt = prompt_template.format(context=context_text, question=query_text)
+
+     model = ChatOpenAI()
+     response_text = model.predict(prompt)
+
+     return response_text
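A sketch of how these pieces compose (assumes OPENAI_API_KEY is set and the relative chroma directory is writable; "dune" is an illustrative store name):

    from scripts.decision_clf import rag_clf

    chunks = rag_clf.get_chunks("data/dune.pdf")
    db = rag_clf.get_vectordb(chunks, "dune")      # persists under ../../data/chroma/dune
    print(rag_clf.classify_dec(chunks[0].page_content, db))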
scripts/decision_clf/seq_clf.py ADDED
@@ -0,0 +1,140 @@
+ import pickle
+
+ import numpy as np
+ import pandas as pd
+
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from tensorflow.keras.metrics import Precision, Recall
+ from tensorflow.keras.layers import Dense, ReLU
+ from tensorflow.keras.layers import Embedding, BatchNormalization, Concatenate
+ from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dropout
+ from tensorflow.keras.models import Sequential, Model, load_model
+ from keras.utils import to_categorical
+
+ from sklearn.preprocessing import LabelEncoder
+ from sklearn.model_selection import train_test_split
+
+ def prep_data():
+     # split proportions for train / test / validation
+     train_size = 0.9
+     test_size = 0.05
+     val_size = 0.05
+
+     df = pd.read_csv('../../data/output/decisions.csv')
+     df = df[['text', 'decision']]
+
+     # First split into train and (test + val)
+     df, test_val_df = train_test_split(df, test_size=(test_size + val_size), random_state=42)
+
+     # Then split test_val_df into test and validation sets
+     test_df, val_df = train_test_split(test_val_df, test_size=val_size/(test_size + val_size), random_state=42)
+
+     return df, test_df, val_df
+
+ def split_data():
+     df, test_df, val_df = prep_data()
+
+     X_train = df['text']
+     y_train = df['decision']
+
+     X_test = test_df['text']
+     y_test = test_df['decision']
+
+     X_val = val_df['text']
+     y_val = val_df['decision']
+
+     encoder = LabelEncoder()
+
+     y_train = encoder.fit_transform(y_train)
+     y_val = encoder.transform(y_val)
+     y_test = encoder.transform(y_test)
+
+     mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))
+
+     return X_train, y_train, X_test, y_test, X_val, y_val, mapping
+
+ def prep_model():
+     max_words = 10000
+     max_len = 50
+     embedding_dim = 32
+
+     # Branch 1
+     branch1 = Sequential()
+     branch1.add(Embedding(max_words, embedding_dim, input_length=max_len))
+     branch1.add(Conv1D(64, 3, padding='same', activation='relu'))
+     branch1.add(BatchNormalization())
+     branch1.add(ReLU())
+     branch1.add(Dropout(0.5))
+     branch1.add(GlobalMaxPooling1D())
+
+     # Branch 2
+     branch2 = Sequential()
+     branch2.add(Embedding(max_words, embedding_dim, input_length=max_len))
+     branch2.add(Conv1D(64, 3, padding='same', activation='relu'))
+     branch2.add(BatchNormalization())
+     branch2.add(ReLU())
+     branch2.add(Dropout(0.5))
+     branch2.add(GlobalMaxPooling1D())
+
+     concatenated = Concatenate()([branch1.output, branch2.output])
+
+     hid_layer = Dense(128, activation='relu')(concatenated)
+     dropout = Dropout(0.3)(hid_layer)
+     output_layer = Dense(2, activation='softmax')(dropout)
+
+     model = Model(inputs=[branch1.input, branch2.input], outputs=output_layer)
+
+     model.compile(optimizer='adamax',
+                   loss='binary_crossentropy',
+                   metrics=['accuracy', Precision(), Recall()])
+
+     return model
+
+ def train_model():
+     X_train, y_train, X_test, y_test, X_val, y_val, mapping = split_data()
+
+     tokenizer = Tokenizer(num_words=10000)
+     tokenizer.fit_on_texts(X_train)
+
+     sequences = tokenizer.texts_to_sequences(X_train)
+     tr_x = pad_sequences(sequences, maxlen=50)
+     tr_y = to_categorical(y_train)
+
+     sequences = tokenizer.texts_to_sequences(X_val)
+     val_x = pad_sequences(sequences, maxlen=50)
+     val_y = to_categorical(y_val)
+
+     sequences = tokenizer.texts_to_sequences(X_test)
+     ts_x = pad_sequences(sequences, maxlen=50)
+     ts_y = to_categorical(y_test)
+
+     model = prep_model()
+
+     batch_size = 256
+     epochs = 100
+     # both branches consume the same padded sequences, hence the doubled input
+     history = model.fit([tr_x, tr_x], tr_y, epochs=epochs, batch_size=batch_size,
+                         validation_data=([val_x, val_x], val_y))
+
+     with open('../../data/models/dec_clf/tokenizer.pkl', 'wb') as tokenizer_file:
+         pickle.dump(tokenizer, tokenizer_file)
+
+     model.save('../../data/models/dec_clf/nlp.h5')
+
+ def predict(text, model_path, token_path):
+     model = load_model(model_path)
+
+     with open(token_path, 'rb') as f:
+         tokenizer = pickle.load(f)
+
+     sequences = tokenizer.texts_to_sequences([text])
+     x_new = pad_sequences(sequences, maxlen=50)
+     predictions = model.predict([x_new, x_new])
+
+     mapping = {0: 'no', 1: 'yes'}
+
+     probs = list(predictions[0])
+     max_idx = np.argmax(probs)
+
+     return mapping[max_idx]
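This is the classifier main.py actually serves, with the artifacts committed under models/dec_clf above; standalone it looks like:

    from scripts.decision_clf import seq_clf

    label = seq_clf.predict("Paul chose to follow Jessica into the desert.",
                            "models/dec_clf/nlp.h5",
                            "models/dec_clf/tokenizer.pkl")
    print(label)  # "yes" or "no"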
scripts/path_gen/paths_gen.py ADDED
@@ -0,0 +1,79 @@
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores.chroma import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+
+
+ def get_chunks(file_path):
+     loader = PyPDFLoader(file_path)
+
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=300,
+         chunk_overlap=100,
+         length_function=len,
+         add_start_index=True,
+     )
+
+     chunks = text_splitter.split_documents(documents)
+
+     return chunks
+
+ def get_vectordb(chunks):
+     # in-memory store; the persistent variant is kept below for reference
+     db = Chroma.from_documents(chunks, OpenAIEmbeddings())
+
+     # CHROMA_PATH = f"../../chroma/{CHROMA_PATH}"
+     # if os.path.exists(CHROMA_PATH):
+     #     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
+     # else:
+     #     db = Chroma.from_documents(
+     #         chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
+     #     )
+     #     db.persist()
+     #     print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
+
+     return db
+
+
+ def gen_sample(text, db):
+     PROMPT_TEMPLATE = """
+     Answer the question based only on the following context:
+
+     {context}
+
+     ---
+
+     Answer the question based on the above context: {question}
+     """
+
+     query_text = f"""
+     Act as the author of a Choose Your Own Adventure Book. This book is special as it is based on existing material.
+     Now, as with any choose your own adventure book, you'll have to generate decision paths at certain points in the story.
+     Your job is to generate 4 decision paths for the given point in the story, if applicable to that point in the story.
+     If the given part of the story doesn't contain any decisions from which to generate decision paths, don't
+     generate any. If the given part of the story contains a decision, generate 4 decision paths for that decision.
+     One among the 4 decision paths should be the original path; the other 3 should deviate from the original path in a sensible manner.
+     The decision paths should be generated in a way that they are coherent with the existing story.
+     The result should be a JSON object with the following keys: [text, paths]
+
+     text: The given text
+     paths: The generated decision paths as strings in a list
+
+     ```{text}```
+     """
+
+     results = db.similarity_search_with_relevance_scores(query_text, k=5)
+
+     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+     prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+     prompt = prompt_template.format(context=context_text, question=query_text)
+
+     model = ChatOpenAI()
+     response_text = model.predict(prompt)
+
+     # NOTE: eval() trusts the model to emit a Python/JSON literal; json.loads would be stricter
+     return eval(response_text)
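main.py wires this up at startup; in isolation (OPENAI_API_KEY assumed, and the "paths" key only exists if the model honored the JSON contract in the prompt):

    from scripts.path_gen import paths_gen

    chunks = paths_gen.get_chunks("data/dune.pdf")
    db = paths_gen.get_vectordb(chunks)            # in-memory Chroma
    sample = paths_gen.gen_sample(chunks[0].page_content, db)
    print(sample["paths"])                         # up to 4 candidate decision paths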
scripts/summarization/__pycache__/falcon_summ.cpython-311.pyc ADDED
Binary file (866 Bytes).
 
scripts/summarization/__pycache__/rag_summ.cpython-311.pyc ADDED
Binary file (4.2 kB).
 
scripts/summarization/__pycache__/t5_summ.cpython-311.pyc ADDED
Binary file (5.36 kB).
 
scripts/summarization/falcon_summ.py ADDED
@@ -0,0 +1,12 @@
+
+ from transformers import pipeline
+
+ def prep_pipeline():
+     summarizer = pipeline("summarization", model="Falconsai/text_summarization")
+
+     return summarizer
+
+ def gen_summary(summarizer, text):
+     summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
+
+     return summary
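Usage mirrors what main.py does at startup (the input text here is a placeholder; anything comfortably longer than min_length=30 tokens works):

    from scripts.summarization import falcon_summ

    summarizer = falcon_summ.prep_pipeline()   # pulls Falconsai/text_summarization on first use
    text = "..."                               # any sufficiently long passage
    print(falcon_summ.gen_summary(summarizer, text))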
scripts/summarization/graph_summ.py ADDED
File without changes
scripts/summarization/rag_summ.py ADDED
@@ -0,0 +1,88 @@
+ import os
+
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores.chroma import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+
+ import PyPDF2
+
+ def read_pages(pdf_file):
+     pages = []
+
+     reader = PyPDF2.PdfReader(pdf_file)
+
+     for page_number in range(len(reader.pages)):
+         page = reader.pages[page_number]
+         page_content = page.extract_text()
+         pages.append(page_content)
+
+     return pages
+
+ def get_chunks(file_path):
+     loader = PyPDFLoader(file_path)
+
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=300,
+         chunk_overlap=100,
+         length_function=len,
+         add_start_index=True,
+     )
+
+     chunks = text_splitter.split_documents(documents)
+
+     return chunks
+
+ def get_vectordb(chunks, CHROMA_PATH):
+     CHROMA_PATH = f"../../data/chroma/{CHROMA_PATH}"
+
+     if os.path.exists(CHROMA_PATH):
+         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
+     else:
+         db = Chroma.from_documents(
+             chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
+         )
+         db.persist()
+         print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
+
+     return db
+
+ def gen_summary(text, db):
+     PROMPT_TEMPLATE = """
+     Answer the question based only on the following context:
+
+     {context}
+
+     ---
+
+     Answer the question based on the above context: {question}
+     """
+
+     query_text = f"""
+     Summarize the given chunk from a story. The summary should be of a narrative nature and be around 5-7 sentences long.
+
+     ```{text}```
+
+     Generate the response in the following JSON format:
+
+     {{
+         "summary": "Your summary here.",
+         "text": "The original text here."
+     }}
+     """
+
+     results = db.similarity_search_with_relevance_scores(query_text, k=5)
+
+     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+     prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+     prompt = prompt_template.format(context=context_text, question=query_text)
+
+     model = ChatOpenAI()
+     response_text = model.predict(prompt)
+
+     # NOTE: eval() trusts the model to emit a dict literal; json.loads would be stricter
+     return eval(response_text)
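As with rag_clf, the flow is chunks → vector store → query (OPENAI_API_KEY assumed; "dune-summ" is an illustrative store name, and the "summary" key depends on the model honoring the JSON format):

    from scripts.summarization import rag_summ

    chunks = rag_summ.get_chunks("data/dune.pdf")
    db = rag_summ.get_vectordb(chunks, "dune-summ")
    result = rag_summ.gen_summary(chunks[0].page_content, db)
    print(result["summary"])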
scripts/summarization/t5_summ.py ADDED
@@ -0,0 +1,103 @@
+ import numpy as np
+
+ import evaluate
+
+ from datasets import load_dataset
+
+ from transformers import AutoTokenizer
+ from transformers import DataCollatorForSeq2Seq
+ from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
+ from transformers import pipeline
+
+ checkpoint = "Falconsai/text_summarization"
+
+ output_dir = "falcon-summ"
+
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+ rouge = evaluate.load("rouge")
+
+ def compute_metrics(eval_pred):
+     predictions, labels = eval_pred
+     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+     result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+
+     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
+     result["gen_len"] = np.mean(prediction_lens)
+
+     return {k: round(v, 4) for k, v in result.items()}
+
+ def preprocess_function(examples, max_length=1024, max_target_length=128):
+     prefix = "summarize: "
+
+     inputs = [prefix + doc for doc in examples["text"]]
+
+     model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)
+
+     labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)
+
+     model_inputs["labels"] = labels["input_ids"]
+
+     return model_inputs
+
+ def prep_data():
+     billsum = load_dataset("billsum", split="ca_test")
+
+     billsum = billsum.train_test_split(test_size=0.2)
+
+     return billsum
+
+ def prep_model():
+     billsum = prep_data()
+
+     tokenized_billsum = billsum.map(preprocess_function, batched=True)
+
+     data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
+
+     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+
+     training_args = Seq2SeqTrainingArguments(
+         output_dir=output_dir,
+         evaluation_strategy="epoch",
+         learning_rate=2e-5,
+         per_device_train_batch_size=16,
+         per_device_eval_batch_size=16,
+         weight_decay=0.01,
+         save_total_limit=3,
+         num_train_epochs=30,
+         predict_with_generate=True,
+         fp16=True,
+         push_to_hub=True,
+     )
+
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=tokenized_billsum["train"],
+         eval_dataset=tokenized_billsum["test"],
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+         compute_metrics=compute_metrics,
+     )
+
+     return trainer
+
+ def train_model(trainer):
+     trainer.train()
+
+     trainer.save_model(output_dir)
+
+     trainer.push_to_hub()
+
+ def prep_pipeline():
+     summarizer = pipeline("summarization", model=f"suneeln-duke/{output_dir}")
+
+     return summarizer
+
+ def gen_summary(summarizer, text):
+     summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
+
+     return summary
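Training entry point, as the module defines it (push_to_hub=True means a Hub login is assumed before running):

    from scripts.summarization import t5_summ

    trainer = t5_summ.prep_model()   # tokenizes billsum and builds the Seq2SeqTrainer
    t5_summ.train_model(trainer)     # trains, saves to ./falcon-summ, pushes to the Hub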
scripts/text_gen/story_gen.py ADDED
@@ -0,0 +1,73 @@
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores.chroma import Chroma
+ from langchain.chat_models import ChatOpenAI
+ from langchain.prompts import ChatPromptTemplate
+
+
+ def get_chunks(file_path):
+     loader = PyPDFLoader(file_path)
+
+     documents = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=300,
+         chunk_overlap=100,
+         length_function=len,
+         add_start_index=True,
+     )
+
+     chunks = text_splitter.split_documents(documents)
+
+     return chunks
+
+ def get_vectordb(chunks):
+     # in-memory store; the persistent variant is kept below for reference
+     db = Chroma.from_documents(chunks, OpenAIEmbeddings())
+
+     # if os.path.exists(CHROMA_PATH):
+     #     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())
+     # else:
+     #     db = Chroma.from_documents(
+     #         chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
+     #     )
+     #     db.persist()
+     #     print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
+
+     return db
+
+
+ def gen_sample(text, decision, db):
+     PROMPT_TEMPLATE = """
+     Answer the question based only on the following context:
+
+     {context}
+
+     ---
+
+     Answer the question based on the above context: {question}
+     """
+
+     query_text = f"""
+     Act as the author of a Choose Your Own Adventure Book. This book is special as it is based on existing material.
+     Now, as with any choose your own adventure book, there are infinite paths based on the choices a user makes.
+     Given some relevant text and the decision taken with respect to the relevant text, generate the next part of the story.
+     It should be within 6-8 sentences and read as if it were actually part of the story.
+
+     Relevant: {text}
+
+     Decision: {decision}
+     """
+
+     results = db.similarity_search_with_relevance_scores(query_text, k=5)
+
+     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+     prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+     prompt = prompt_template.format(context=context_text, question=query_text)
+
+     model = ChatOpenAI()
+     response_text = model.predict(prompt)
+
+     # the model returns plain prose here, so no eval() — return it as-is
+     return response_text
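End-to-end, matching the /gen_story endpoint (OPENAI_API_KEY assumed; the decision string is illustrative):

    from scripts.text_gen import story_gen

    chunks = story_gen.get_chunks("data/dune.pdf")
    db = story_gen.get_vectordb(chunks)
    continuation = story_gen.gen_sample(chunks[0].page_content,
                                        "Paul decides to trust the Fremen.", db)
    print(continuation)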