eli02 committed
Commit 8c23c78 · 1 Parent(s): 2d6fe2b

Remove unused parquet file and update requirements with specific package versions for better dependency management

[openai_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet → [all_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9f94d381f4dfcff0bbf6bfa5c84def47794d1596e12e2204a2a4bb413fc25a05
- size 2257769
+ oid sha256:ced650f23166f55939fb6dfec6df2fd7d83995a9db362a1a7460d36e6f3ab510
+ size 3118786
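
The parquet file is tracked with Git LFS, so the diff above only shows the pointer changing: a new object hash and a larger byte size for the re-embedded data. A quick sanity check after fetching the LFS object (e.g. with git lfs pull) could look like the sketch below; this is hypothetical and not part of the commit, with the column names inferred from how main.py below reads the file:

import pandas as pd

# Inspect the re-embedded parquet: main.py expects an "ext" column holding the
# chunk text plus one embedding column per model ("openai_embedding" and
# "all_mpnet_embedding"). The exact column order is an assumption.
df = pd.read_parquet(
    "[all_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet"
)
print(df.columns.tolist())
print(len(df), "rows")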
main.py CHANGED
@@ -7,11 +7,17 @@ from jose import JWTError, jwt
  from datetime import datetime, timedelta
  from openai import OpenAI
  from pathlib import Path
- from typing import List
+ from typing import List, Optional, Dict
+ from datasets import Dataset, load_dataset
+ from sentence_transformers import SentenceTransformer
+ from huggingface_hub import login
  import pandas as pd
+ import numpy as np
+ import torch as t
  import os
  import logging
-
+ from functools import lru_cache
+ from diskcache import Cache

  # Configure logging
  logging.basicConfig(level=logging.INFO)
@@ -19,6 +25,9 @@ logging.basicConfig(level=logging.INFO)
  # Initialize FastAPI app
  app = FastAPI()

+ # Initialize disk cache
+ cache = Cache('./cache')
+
  # JWT Configuration
  SECRET_KEY = os.environ.get("prime_auth", "c0369f977b69e717dc16f6fc574039eb2b1ebde38014d2be")
  REFRESH_SECRET_KEY = os.environ.get("prolonged_auth", "916018771b29084378c9362c0cd9e631fd4927b8aea07f91")
@@ -26,27 +35,141 @@ ALGORITHM = "HS256"
  ACCESS_TOKEN_EXPIRE_MINUTES = 30
  REFRESH_TOKEN_EXPIRE_DAYS = 7

- # OAuth2 scheme for token authentication
  oauth2_scheme = OAuth2PasswordBearer(tokenUrl="login")

- # Load credentials from environment variables
+ # Pydantic models
+ class QueryInput(BaseModel):
+     query: str
+
+ class SearchResult(BaseModel):
+     text: str
+     similarity: float
+     model_type: str
+
+ class TokenResponse(BaseModel):
+     access_token: str
+     refresh_token: str
+     token_type: str
+
+ class SaveInput(BaseModel):
+     user_type: str
+     username: str
+     query: str
+     retrieved_text: str
+     model_type: str
+     reaction: str
+
+ class SaveBatchInput(BaseModel):
+     items: List[SaveInput]
+
+ class RefreshRequest(BaseModel):
+     refresh_token: str
+
+ # Cache management
+ @lru_cache(maxsize=1)
+ def get_sentence_transformer():
+     """Load and cache the SentenceTransformer model with lru_cache"""
+     return SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")
+
+ def get_cached_embeddings(text: str, model_type: str) -> Optional[List[float]]:
+     """Try to get embeddings from cache"""
+     cache_key = f"{model_type}_{hash(text)}"
+     return cache.get(cache_key)
+
+ def set_cached_embeddings(text: str, model_type: str, embeddings: List[float]):
+     """Store embeddings in cache"""
+     cache_key = f"{model_type}_{hash(text)}"
+     cache.set(cache_key, embeddings, expire=86400)  # Cache for 24 hours
+
+ @lru_cache(maxsize=1)
+ def load_dataframe():
+     """Load and cache the parquet dataframe"""
+     database_file = Path(__file__).parent / "[all_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet"
+     return pd.read_parquet(database_file)
+
+ # Utility functions
+ def cosine_similarity(embedding_0, embedding_1):
+     dot_product = sum(a * b for a, b in zip(embedding_0, embedding_1))
+     norm_0 = sum(a * a for a in embedding_0) ** 0.5
+     norm_1 = sum(b * b for b in embedding_1) ** 0.5
+     return dot_product / (norm_0 * norm_1)
+
+ def generate_embedding(model, text: str, model_type: str) -> List[float]:
+     # Try to get from cache first
+     cached_embedding = get_cached_embeddings(text, model_type)
+     if cached_embedding is not None:
+         return cached_embedding
+
+     # Generate new embedding if not in cache
+     if model_type == "all-mpnet-base-v2":
+         chunk_embedding = model.encode(
+             text,
+             convert_to_tensor=True
+         )
+         embedding = np.array(t.Tensor.cpu(chunk_embedding)).tolist()
+     elif model_type == "openai":
+         response = model.embeddings.create(
+             input=text,
+             model="text-embedding-3-small"
+         )
+         embedding = response.data[0].embedding
+
+     # Cache the new embedding
+     set_cached_embeddings(text, model_type, embedding)
+     return embedding
+
+ def search_query(client, st_model, query: str, df: pd.DataFrame, n: int = 1) -> List[Dict]:
+     # Generate embeddings for both models
+     mpnet_embedding = generate_embedding(st_model, query, "all-mpnet-base-v2")
+     openai_embedding = generate_embedding(client, query, "openai")
+
+     # Calculate similarities
+     df['mpnet_similarities'] = df.all_mpnet_embedding.apply(
+         lambda x: cosine_similarity(x, mpnet_embedding)
+     )
+     df['openai_similarities'] = df.openai_embedding.apply(
+         lambda x: cosine_similarity(x, openai_embedding)
+     )
+
+     # Get top results for each model
+     mpnet_results = df.nlargest(n, 'mpnet_similarities')
+     openai_results = df.nlargest(n, 'openai_similarities')
+
+     # Format results
+     results = []
+
+     for _, row in mpnet_results.iterrows():
+         results.append({
+             "text": row["ext"],
+             "similarity": float(row["mpnet_similarities"]),
+             "model_type": "all-mpnet-base-v2"
+         })
+
+     for _, row in openai_results.iterrows():
+         results.append({
+             "text": row["ext"],
+             "similarity": float(row["openai_similarities"]),
+             "model_type": "openai"
+         })
+
+     return results
+
+ # Authentication functions
  def load_credentials():
      credentials = {}
-     for i in range(1, 51): # Assuming you have 50 credentials
+     for i in range(1, 51):
          username = os.environ.get(f"login_{i}")
          password = os.environ.get(f"password_{i}")
          if username and password:
              credentials[username] = password
      return credentials

- # Authenticate user and create token
  def authenticate_user(username: str, password: str):
      credentials_dict = load_credentials()
      if username in credentials_dict and credentials_dict[username] == password:
          return username
      return None

- # Create JWT token
  def create_token(data: dict, expires_delta: timedelta, secret_key: str):
      to_encode = data.copy()
      expire = datetime.utcnow() + expires_delta
@@ -54,7 +177,6 @@ def create_token(data: dict, expires_delta: timedelta, secret_key: str):
      encoded_jwt = jwt.encode(to_encode, secret_key, algorithm=ALGORITHM)
      return encoded_jwt

- # Verify JWT token
  def verify_token(token: str, secret_key: str):
      credentials_exception = HTTPException(
          status_code=status.HTTP_401_UNAUTHORIZED,
@@ -70,71 +192,20 @@ def verify_token(token: str, secret_key: str):
          raise credentials_exception
      return username

- # Verify access token
  def verify_access_token(token: str = Depends(oauth2_scheme)):
      return verify_token(token, SECRET_KEY)

- # Verify refresh token
- def verify_refresh_token(token: str):
-     return verify_token(token, REFRESH_SECRET_KEY)
-
- # Load data from parquet file
- def load_data(database_file):
-     df = pd.read_parquet(database_file)
-
-     return df
-
- # Generate OpenAI embeddings
- def generate_openai_embeddings(client, text):
-     response = client.embeddings.create(
-         input=text,
-         model="text-embedding-3-small"
-     )
-     return response.data[0].embedding
-
- # Compute cosine similarity
- def cosine_similarity(embedding_0, embedding_1):
-     dot_product = sum(a * b for a, b in zip(embedding_0, embedding_1))
-     norm_0 = sum(a * a for a in embedding_0) ** 0.5
-     norm_1 = sum(b * b for b in embedding_1) ** 0.5
-     return dot_product / (norm_0 * norm_1)
-
- # Search query
- def search_query(client, query, df, n=3):
-     embedding = generate_openai_embeddings(client, query)
-     df['similarities'] = df.openai_embedding.apply(lambda x: cosine_similarity(x, embedding))
-     res = df.sort_values('similarities', ascending=False).head(n)
-     return res
-
- # Pydantic model for the query input
- class QueryInput(BaseModel):
-     query: str
-
- # Pydantic model for the search result
- class SearchResult(BaseModel):
-     text: str
-     similarity: float
-
- # Pydantic model for the token response
- class TokenResponse(BaseModel):
-     access_token: str
-     refresh_token: str
-     token_type: str
-
-
- # Root endpoint
+ # Endpoints
  @app.get("/")
  def index() -> FileResponse:
+     """Serve the custom HTML page from the static directory"""
      file_path = Path(__file__).parent / "static" / "index.html"
      return FileResponse(path=str(file_path), media_type="text/html")

- # Login endpoint to issue tokens
  @app.post("/login", response_model=TokenResponse)
  def login(form_data: OAuth2PasswordRequestForm = Depends()):
-     logging.info("Login attempt for user: %s", form_data.username)
      username = authenticate_user(form_data.username, form_data.password)
      if not username:
-         logging.warning("Authentication failed for user: %s", form_data.username)
          raise HTTPException(
              status_code=status.HTTP_401_UNAUTHORIZED,
              detail="Invalid username or password",
@@ -142,47 +213,150 @@ def login(form_data: OAuth2PasswordRequestForm = Depends()):
          )
      access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
      refresh_token_expires = timedelta(days=REFRESH_TOKEN_EXPIRE_DAYS)
-     access_token = create_token(data={"sub": username}, expires_delta=access_token_expires, secret_key=SECRET_KEY)
-     refresh_token = create_token(data={"sub": username}, expires_delta=refresh_token_expires, secret_key=REFRESH_SECRET_KEY)
-     logging.info("Tokens issued for user: %s", username)
-     return {"access_token": access_token, "refresh_token": refresh_token, "token_type": "bearer"}
+     access_token = create_token(
+         data={"sub": username},
+         expires_delta=access_token_expires,
+         secret_key=SECRET_KEY
+     )
+     refresh_token = create_token(
+         data={"sub": username},
+         expires_delta=refresh_token_expires,
+         secret_key=REFRESH_SECRET_KEY
+     )
+     return {
+         "access_token": access_token,
+         "refresh_token": refresh_token,
+         "token_type": "bearer"
+     }

- # Refresh token endpoint
  @app.post("/refresh", response_model=TokenResponse)
- def refresh(refresh_token: str):
-     username = verify_refresh_token(refresh_token)
-     access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
-     access_token = create_token(data={"sub": username}, expires_delta=access_token_expires, secret_key=SECRET_KEY)
-     return {"access_token": access_token, "refresh_token": refresh_token, "token_type": "bearer"}
+ async def refresh(refresh_request: RefreshRequest):
+     """
+     Endpoint to refresh an access token using a valid refresh token.
+     Returns a new access token and the existing refresh token.
+     """
+     try:
+         # Verify the refresh token
+         username = verify_token(refresh_request.refresh_token, REFRESH_SECRET_KEY)
+
+         # Create new access token
+         access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
+         access_token = create_token(
+             data={"sub": username},
+             expires_delta=access_token_expires,
+             secret_key=SECRET_KEY
+         )
+
+         return {
+             "access_token": access_token,
+             "refresh_token": refresh_request.refresh_token,  # Return the same refresh token
+             "token_type": "bearer"
+         }
+
+     except JWTError:
+         raise HTTPException(
+             status_code=status.HTTP_401_UNAUTHORIZED,
+             detail="Could not validate credentials",
+             headers={"WWW-Authenticate": "Bearer"},
+         )

- # Search endpoint
  @app.post("/search", response_model=List[SearchResult])
- def search(
+ async def search(
      query_input: QueryInput,
      username: str = Depends(verify_access_token),
  ):
-     # Initialize OpenAI client
-     client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-
-     # Load database
-     database_file = Path(__file__).parent / "[openai_embedded] The Alchemy of Happiness (Ghazzālī, Claud Field) (Z-Library).parquet"
-     df = load_data(database_file)
-     logging.info("Database loaded successfully")
-
-     # Perform search
-     res = search_query(client, query_input.query, df, n=3)
-
-     # Format results
-     results = [
-         SearchResult(text=row["ext"], similarity=row["similarities"])
-         for _, row in res.iterrows()
-     ]
+     try:
+         # Initialize clients using cached functions
+         client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+         st_model = get_sentence_transformer()
+         df = load_dataframe()
+
+         # Perform search with both models
+         results = search_query(client, st_model, query_input.query, df, n=1)
+         return [SearchResult(**result) for result in results]
+
+     except Exception as e:
+         logging.error(f"Search error: {str(e)}")
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Search failed: {str(e)}"
+         )

-     return results
+ @app.post("/save")
+ async def save_data(
+     save_input: SaveBatchInput,
+     username: str = Depends(verify_access_token)
+ ):
+     try:
+         # Login to Hugging Face
+         hf_token = os.environ.get("al_ghazali_rag_retrieval_evaluation")
+         if not hf_token:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail="Hugging Face API token not found"
+             )
+         login(token=hf_token)
+
+         # Prepare data for saving
+         data = {
+             "user_type": [],
+             "username": [],
+             "query": [],
+             "retrieved_text": [],
+             "model_type": [],
+             "reaction": []
+         }
+
+         # Add each item to the data dict
+         for item in save_input.items:
+             data["user_type"].append(item.user_type)
+             data["username"].append(item.username)
+             data["query"].append(item.query)
+             data["retrieved_text"].append(item.retrieved_text)
+             data["model_type"].append(item.model_type)
+             data["reaction"].append(item.reaction)
+
+         try:
+             # Load existing dataset and merge
+             dataset = load_dataset(
+                 "HumbleBeeAI/al-ghazali-rag-retrieval-evaluation",
+                 split="train"
+             )
+             existing_data = dataset.to_dict()
+
+             # Add new data
+             for key in data:
+                 if key not in existing_data:
+                     existing_data[key] = ["" if key in ["username", "model_type"] else None] * len(next(iter(existing_data.values())))
+                 existing_data[key].extend(data[key])
+
+         except Exception as e:
+             logging.warning(f"Could not load existing dataset, creating new one: {str(e)}")
+             existing_data = data
+
+         # Create and push dataset
+         updated_dataset = Dataset.from_dict(existing_data)
+         updated_dataset.push_to_hub(
+             "HumbleBeeAI/al-ghazali-rag-retrieval-evaluation"
+         )
+
+         return {"message": "Data saved successfully"}
+
+     except Exception as e:
+         logging.error(f"Save error: {str(e)}")
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Failed to save data: {str(e)}"
+         )

+ # Make sure to keep the static files mounting
  app.mount("/home", StaticFiles(directory="static", html=True), name="home")

- # Run the app
+ # Startup event to create cache directory if it doesn't exist
+ @app.on_event("startup")
+ async def startup_event():
+     os.makedirs("./cache", exist_ok=True)
+
  if __name__ == "__main__":
      import uvicorn
      uvicorn.run(app, host="0.0.0.0", port=7860)
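
For reference, a minimal client sketch against the endpoints above (hypothetical usage, not part of the commit: it assumes the server is running locally on port 7860, that the requests library is installed, and that the credentials match one of the server's login_N/password_N environment variables):

import requests

BASE = "http://localhost:7860"

# /login takes OAuth2 form fields (not JSON)
tokens = requests.post(
    BASE + "/login",
    data={"username": "example_user", "password": "example_pass"},  # hypothetical credentials
).json()

# /search takes a JSON body plus a Bearer access token; with n=1 per model it
# returns two results, one from all-mpnet-base-v2 and one from openai
results = requests.post(
    BASE + "/search",
    json={"query": "What is true happiness?"},
    headers={"Authorization": f"Bearer {tokens['access_token']}"},
).json()
for r in results:
    print(r["model_type"], round(r["similarity"], 3), r["text"][:80])

# /refresh takes the refresh token in a JSON body (the RefreshRequest model)
tokens = requests.post(
    BASE + "/refresh",
    json={"refresh_token": tokens["refresh_token"]},
).json()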
requirements.txt CHANGED
@@ -1,8 +1,14 @@
- fastapi
- uvicorn
- pandas
- openai
- python-dotenv
- pyarrow
- python-jose[cryptography]
- python-multipart
+ fastapi==0.109.2
+ uvicorn==0.27.1
+ python-jose==3.3.0
+ python-multipart==0.0.6  # Required for OAuth2 form handling
+ pydantic==2.6.1
+ openai==1.12.0
+ pandas==2.2.0
+ numpy==1.26.3
+ torch==2.1.2  # For sentence-transformers
+ sentence-transformers==2.3.1
+ datasets==2.17.0
+ huggingface-hub==0.20.3
+ diskcache==5.6.3
+ python-dotenv==1.0.1  # For environment variable management
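
Note that pyarrow was dropped from the pinned list even though pandas.read_parquet needs it; it should still arrive transitively via datasets, which depends on pyarrow. A small sketch (hypothetical, not part of the commit) to confirm the pinned set resolves after pip install -r requirements.txt:

from importlib.metadata import version, PackageNotFoundError

# Print the installed version of each pinned distribution, plus pyarrow,
# which is expected to be pulled in transitively by datasets
for pkg in ["fastapi", "uvicorn", "python-jose", "python-multipart", "pydantic",
            "openai", "pandas", "numpy", "torch", "sentence-transformers",
            "datasets", "huggingface-hub", "diskcache", "python-dotenv", "pyarrow"]:
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "NOT INSTALLED")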