# Stock-Sentiment-Analysis/py/handle_files.py
from datetime import datetime
import json
import os
import pickle
from typing import List

import pandas as pd
from langchain.schema import Document

def create_files(social_media_data, hugg=False):
    # Write the raw records to JSON, CSV, and pickle under the target folder
    folder_path = 'files' if hugg else 'Stock Sentiment Analysis/files'
    os.makedirs(folder_path, exist_ok=True)

    # Save the raw data to a JSON file
    with open(os.path.join(folder_path, 'social_media_data.json'), 'w') as f:
        json.dump(social_media_data, f)

    # Convert the data to a pandas DataFrame and export it
    df = pd.DataFrame(social_media_data)
    df.to_csv(os.path.join(folder_path, 'social_media_data.csv'), index=False)
    df.to_pickle(os.path.join(folder_path, 'social_media_data.pkl'))

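# Example usage (hypothetical records; see the record-shape sketch after
# fetch_social_media_data below):
#
#   create_files(records, hugg=True)  # writes files/social_media_data.{json,csv,pkl}
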
def fetch_social_media_data(hugg=False):
    # Load the saved JSON records and convert them to LangChain Documents
    file_path = 'files/social_media_data.json' if hugg else 'Stock Sentiment Analysis/files/social_media_data.json'
    with open(file_path, 'r') as file:
        data = json.load(file)
    # Delegate to the shared converter defined below instead of duplicating it
    return to_documents(data)

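# A minimal sketch of the record shape these helpers assume (hypothetical
# example values, inferred from the field accesses in to_documents): each item
# carries a "page_content" dict with a "content" field, plus "platform",
# "company", and an optional "link".
#
#   example_record = {
#       "platform": "reddit",
#       "company": "ACME",
#       "page_content": {"content": "ACME shares rallied after earnings."},
#       "link": "https://example.com/post/123",
#   }
#   docs = to_documents([example_record])
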
def save_ingested_data(ingested_data):
    # Save the ingested document list to a pickle file
    with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'wb') as file:
        pickle.dump(ingested_data, file)


def save_analysed_data(analysed_data):
    # Save the analysed document list to a pickle file
    with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'wb') as file:
        pickle.dump(analysed_data, file)


def get_ingested_data():
    # Load the ingested document list from the pickle file
    with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'rb') as file:
        return pickle.load(file)


def get_analysed_data():
    # Load the analysed document list from the pickle file
    with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'rb') as file:
        return pickle.load(file)

def sample_documents(documents: List[Document], n: int) -> List[Document]:
    """
    Keeps the first `n` entries for each unique ("platform", "company")
    metadata combination from the input documents.

    Args:
        documents (List[Document]): The input list of `Document` objects.
        n (int): The maximum number of entries to keep per unique metadata combination.

    Returns:
        List[Document]: A new list of `Document` objects, with at most `n`
        entries per unique metadata combination, in input order.
    """
    # Group documents by their (platform, company) metadata, keeping at most n each
    sampled_docs = {}
    for doc in documents:
        combo = (doc.metadata["platform"], doc.metadata["company"])
        bucket = sampled_docs.setdefault(combo, [])
        if len(bucket) < n:
            bucket.append(doc)
    # Flatten the grouped buckets into a single list
    return [doc for docs in sampled_docs.values() for doc in docs]

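# Example (hypothetical data): with n=2, five Reddit/ACME documents and three
# Twitter/ACME documents reduce to two of each, preserving input order:
#
#   subset = sample_documents(all_docs, n=2)
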
def to_documents(data) -> List[Document]:
    # Convert raw records into LangChain Document objects with metadata
    social_media_document = []
    for item in data:
        social_media_document.append(Document(
            page_content=str(item["page_content"]),
            metadata={
                "platform": item["platform"],
                "company": item["company"],
                "ingestion_timestamp": datetime.now().isoformat(),
                # Count words in the post body rather than raw characters
                "word_count": len(str(item["page_content"]["content"]).split()),
                "link": item.get("link", ""),
            }))
    return social_media_document

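# A minimal end-to-end sketch, assuming the hypothetical record shape shown
# earlier; hugg=True keeps all files under a local `files/` folder.
if __name__ == "__main__":
    demo_data = [{
        "platform": "reddit",
        "company": "ACME",
        "page_content": {"content": "ACME shares rallied after earnings."},
        "link": "https://example.com/post/123",
    }]
    create_files(demo_data, hugg=True)
    docs = fetch_social_media_data(hugg=True)
    print(sample_documents(docs, n=1))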