Manasa1 committed (verified)
Commit 8ddbef4 · Parent(s): 9002cc2

Update app.py

Files changed (1): app.py (+89 -158)

app.py CHANGED
@@ -1,169 +1,100 @@
- from dotenv import load_dotenv
  import streamlit as st
  from langchain_community.document_loaders import UnstructuredPDFLoader
- from langchain_text_splitters.character import CharacterTextSplitter
- from langchain_community.vectorstores import FAISS
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_groq import ChatGroq
- from langchain.memory import ConversationBufferMemory
- from langchain.chains import ConversationalRetrievalChain
- from nltk import word_tokenize, FreqDist, sent_tokenize
- from textblob import TextBlob
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.decomposition import NMF
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline
+ import torch
  import os
- import nltk
- nltk.download('punkt')
-
- # Load secret API key
- secret = os.getenv('Groq_api')
- working_dir = os.path.dirname(os.path.abspath(__file__))
-
- def load_documents(file_path):
-     loader = UnstructuredPDFLoader(file_path)
-     documents = loader.load()
-     return documents
-
- def setup_vectorstore(documents):
-     embeddings = HuggingFaceEmbeddings()
-     text_splitter = CharacterTextSplitter(
-         separator="\n",
-         chunk_size=500,
-         chunk_overlap=100
-     )
-     doc_chunks = text_splitter.split_documents(documents)
-     vectorstores = FAISS.from_documents(doc_chunks, embeddings)
-     return vectorstores
-
- def extract_mentality_traits(documents):
-     # Extract the text from each document
-     text = "\n".join([doc.page_content for doc in documents])
-     blob = TextBlob(text)
-
-     # Analyze tone
-     sentiment = blob.sentiment.polarity
-     tone = "neutral"
-     if sentiment > 0.2:
-         tone = "positive"
-     elif sentiment < -0.2:
-         tone = "negative"
-
-     # Common phrases and reasoning patterns
-     words = word_tokenize(text.lower())
-     fdist = FreqDist(words)
-     common_phrases = [word for word, count in fdist.most_common(10) if len(word) > 3]
-
-     # Topic Modeling to identify key viewpoints
-     vectorizer = TfidfVectorizer(stop_words='english')
-     doc_term_matrix = vectorizer.fit_transform([text])
-     nmf = NMF(n_components=2, random_state=1)
-     nmf.fit(doc_term_matrix)
-     topics = []
-     for topic in nmf.components_:
-         topic_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]]
-         topics.append(", ".join(topic_words))
-
-     mentality_traits = {
-         "tone": tone,
-         "common_phrases": common_phrases,
-         "reasoning_pattern": "deductive" if sentiment > 0 else "inductive",
-         "key_viewpoints": topics
-     }
-     return mentality_traits
-
- from langchain.prompts import PromptTemplate
-
- from langchain.prompts import PromptTemplate
- from langchain.chains import LLMChain
-
- from langchain.prompts import PromptTemplate
- from langchain.chains import LLMChain
-
- def create_chain(vectorstores, mentality_traits):
-     # Create style prompt dynamically based on mentality traits
-     style_prompt = (
-         f"Think and respond in the same mindset as the person from the uploaded document. "
-         f"The author's tone is generally {mentality_traits['tone']}. "
-         f"Their reasoning style is mostly {mentality_traits['reasoning_pattern']} and they often discuss themes such as {', '.join(mentality_traits['key_viewpoints'])}. "
-         f"Use phrases like {', '.join(mentality_traits['common_phrases'])}. "
-         f"Respond as if you are the author, mirroring their values, reasoning style, and mentality in all replies."
-     )
-
-     # Create a custom prompt template for conversational retrieval
-     prompt = PromptTemplate(input_variables=["question"], template=style_prompt + "\nQuestion: {question}\nAnswer:")
-
-     llm = ChatGroq(
-         api_key=secret,
-         model="llama-3.1-70b-versatile",
-         temperature=0.7
-     )
-
-     # Create the LLM chain that includes the custom style prompt
-     llm_chain = LLMChain(llm=llm, prompt=prompt)
-
-     retriever = vectorstores.as_retriever()
-     memory = ConversationBufferMemory(
-         llm=llm,
-         output_key="answer",
-         memory_key="chat_history",
-         return_messages=True
-     )
-
-     # Now, create the ConversationalRetrievalChain without the `prompt`
-     chain = ConversationalRetrievalChain.from_llm(
-         llm=llm_chain,  # Use the LLMChain here, not the LLM directly
-         retriever=retriever,
-         memory=memory,
-         verbose=True
-     )
-
-     return chain
-
- # Streamlit setup
+ from datasets import Dataset
+ import pandas as pd
+ import re
+
+ # Set up page
  st.set_page_config(
-     page_title="Chat with your documents",
-     page_icon="📑",
+     page_title="Tweet Style Cloning",
+     page_icon="🐦",
      layout="centered"
  )
+ st.title("🐦 Clone Tweet Style from PDF")

- st.title("📝Chat With Your Docs 😎")
-
- if "chat_history" not in st.session_state:
-     st.session_state.chat_history = []
+ # Step 1: Upload PDF
+ uploaded_file = st.file_uploader("Upload a PDF with tweets")

- uploaded_file = st.file_uploader(label="Upload your PDF")
+ if uploaded_file is not None:
+     # Step 2: Extract text from PDF
+     def load_pdf_text(file_path):
+         loader = UnstructuredPDFLoader(file_path)
+         documents = loader.load()
+         return " ".join([doc.page_content for doc in documents])

- if uploaded_file:
-     file_path = f"{working_dir}/{uploaded_file.name}"
-     with open(file_path, "wb") as f:
+     # Save the uploaded PDF file temporarily
+     with open("uploaded_tweets.pdf", "wb") as f:
          f.write(uploaded_file.getbuffer())

-     # Load and process the PDF document
-     documents = load_documents(file_path)
-     if "vectorstores" not in st.session_state:
-         st.session_state.vectorstores = setup_vectorstore(documents)
-
-     # Extract mentality traits and create the conversational chain with mental adaptation
-     mentality_traits = extract_mentality_traits(documents)
-     if "conversation_chain" not in st.session_state:
-         st.session_state.conversation_chain = create_chain(st.session_state.vectorstores, mentality_traits)
-
-     for message in st.session_state.chat_history:
-         with st.chat_message(message["role"]):
-             st.markdown(message["content"])
-
-     user_input = st.chat_input("Ask any questions relevant to uploaded PDF")
-
-     if user_input:
-         st.session_state.chat_history.append({"role": "user", "content": user_input})
-         with st.chat_message("user"):
-             st.markdown(user_input)
-
-         # Use the conversation chain to generate the response
-         response = st.session_state.conversation_chain({"question": user_input})
-
-         # Ensure that assistant_response is cast as a string
-         assistant_response = str(response.get("answer", "Sorry, I couldn't generate a response."))
-         st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
-         with st.chat_message("assistant"):
-             st.markdown(assistant_response)
+     # Extract text from PDF
+     extracted_text = load_pdf_text("uploaded_tweets.pdf")
+
+     # Step 3: Preprocess text to separate each tweet (assuming tweets end with newline)
+     tweets = re.split(r'\n+', extracted_text)
+     tweets = [tweet.strip() for tweet in tweets if len(tweet.strip()) > 0]
+
+     # Display a few sample tweets for verification
+     st.write("Sample Tweets Extracted:")
+     st.write(tweets[:5])
+
+     # Step 4: Fine-tune a model on the extracted tweets
+     def fine_tune_model(tweets):
+         # Convert tweets to a DataFrame and Dataset
+         df = pd.DataFrame(tweets, columns=["text"])
+         tweet_dataset = Dataset.from_pandas(df)
+
+         # Load model and tokenizer
+         model_name = "gpt2"  # Replace with a suitable model if needed
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         model = AutoModelForCausalLM.from_pretrained(model_name)
+         # GPT-2 ships without a pad token; reuse EOS so padding below works
+         tokenizer.pad_token = tokenizer.eos_token
+
+         # Tokenize the dataset
+         def tokenize_function(examples):
+             return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
+
+         tokenized_tweets = tweet_dataset.map(tokenize_function, batched=True)
+
+         # The Trainer needs labels to compute a causal-LM loss; this collator
+         # copies input_ids into labels (mlm=False means plain language modeling)
+         from transformers import DataCollatorForLanguageModeling
+         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+         # Training arguments
+         training_args = TrainingArguments(
+             output_dir="./fine_tuned_tweet_model",
+             per_device_train_batch_size=4,
+             num_train_epochs=3,
+             save_steps=10_000,
+             save_total_limit=1,
+             logging_dir='./logs',
+         )
+
+         # Initialize the Trainer
+         trainer = Trainer(
+             model=model,
+             args=training_args,
+             train_dataset=tokenized_tweets,
+             data_collator=data_collator,
+         )
+
+         # Fine-tune the model
+         trainer.train()
+
+         # Save the fine-tuned model
+         model.save_pretrained("fine_tuned_tweet_model")
+         tokenizer.save_pretrained("fine_tuned_tweet_model")
+
+         return model, tokenizer
+
+     # Trigger fine-tuning and notify user
+     with st.spinner("Fine-tuning model..."):
+         model, tokenizer = fine_tune_model(tweets)
+     st.success("Model fine-tuned successfully!")
+
+     # Step 5: Set up text generation
+     tweet_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+     # Generate a new tweet based on user input
+     prompt = st.text_input("Enter a prompt for a new tweet in the same style:")
+     if prompt:
+         with st.spinner("Generating tweet..."):
+             generated_tweet = tweet_generator(prompt, max_length=50, num_return_sequences=1)
+         st.write("Generated Tweet:")
+         st.write(generated_tweet[0]["generated_text"])
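
A note on the new flow: Streamlit re-executes the whole script on every widget interaction, so as written the app fine-tunes GPT-2 again each time a prompt is submitted. A minimal sketch of one way around that, assuming the fine_tune_model function and tweets list from app.py above (st.cache_resource is Streamlit's built-in cache for expensive objects such as models; get_tweet_model is a hypothetical wrapper, not part of the commit):

    import streamlit as st

    # Cache the (model, tokenizer) pair so Trainer.train() runs once per
    # unique tweet set instead of on every Streamlit rerun.
    @st.cache_resource
    def get_tweet_model(tweet_texts: tuple):
        # Pass a tuple so the cache key stays stable (lists are mutable).
        return fine_tune_model(list(tweet_texts))

    model, tokenizer = get_tweet_model(tuple(tweets))

The removed version of app.py guarded its vector store and conversation chain with st.session_state for the same reason; either mechanism avoids repeating the training step. Separately, the text-generation pipeline decodes greedily for GPT-2 by default; passing standard generate kwargs through the pipeline call, e.g. tweet_generator(prompt, max_new_tokens=40, do_sample=True, top_p=0.9, temperature=0.8), usually yields more varied, tweet-like outputs.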