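"""Multilingual health-and-wellness RAG chatbot demo: Cohere embeddings with a
LanceDB vector store for retrieval, Argos Translate for input/output translation,
and a Gradio interface for asking questions and uploading documents."""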
import os
import dotenv
import gradio as gr
import lancedb
import logging
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.llms import Cohere
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import LanceDB
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Assume these loaders are implemented based on your specific requirements
from custom_document_loaders import TextLoader, PyPDFLoader, DocxLoader, ImageLoader
import argostranslate.package
import argostranslate.translate
import shutil

# Configuration and Logging
dotenv.load_dotenv(".env")
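# Assumption (not stated in the original): the LangChain Cohere wrappers used below
# read the API key from the COHERE_API_KEY environment variable, so the .env file
# loaded above should define it.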
DB_PATH = "/tmp/lancedb"
COHERE_MODEL_NAME = "multilingual-22-12"
LANGUAGE_ISO_CODES = {"English": "en", "Hindi": "hi", "Turkish": "tr", "French": "fr"}

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize argostranslate
argostranslate.package.update_package_index()
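
# Sketch (assumption, not in the original code): update_package_index() only refreshes
# the package list; the English<->target translation packages still need to be installed
# for translate_text() to work. A helper along these lines could do it (note that it
# downloads the models on first run).
def install_translation_packages(target_codes=("hi", "tr", "fr")):
    available = argostranslate.package.get_available_packages()
    for code in target_codes:
        for from_code, to_code in (("en", code), (code, "en")):
            pkg = next(
                (p for p in available if p.from_code == from_code and p.to_code == to_code),
                None,
            )
            if pkg is not None:
                argostranslate.package.install_from_path(pkg.download())
            else:
                logger.warning(f"No Argos Translate package found for {from_code}->{to_code}")

install_translation_packages()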

def initialize_documents_and_embeddings(input_file_path):
    logger.info(f"Processing file: {input_file_path}")
    file_extension = os.path.splitext(input_file_path)[1].lower()
    loader = None

    if file_extension in [".txt"]:
        loader = TextLoader(input_file_path)
    elif file_extension in [".pdf"]:
        loader = PyPDFLoader(input_file_path)
    elif file_extension in [".doc", ".docx"]:
        loader = DocxLoader(input_file_path)
    elif file_extension in [".jpg", ".jpeg", ".png"]:
        loader = ImageLoader(input_file_path)
    else:
        raise ValueError(
            "Unsupported file type. Supported: .txt, .pdf, .doc/.docx, and .jpg/.jpeg/.png images."
        )

    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    embeddings = CohereEmbeddings(model=COHERE_MODEL_NAME)
    return texts, embeddings

def initialize_database(texts, embeddings):
    if os.path.exists(DB_PATH):
        shutil.rmtree(DB_PATH)  # Ensure a fresh start
    db = lancedb.connect(DB_PATH)
    # lancedb needs data (or an explicit schema) to create a table, so seed it with one
    # embedded row that the LangChain LanceDB wrapper can infer the schema from.
    seed = [{"vector": embeddings.embed_query("Hello World"), "text": "Hello World", "id": "1"}]
    table = db.create_table("multiling-rag", data=seed, mode="overwrite")
    return LanceDB.from_documents(texts, embeddings, connection=table)

def translate_text(text, from_code, to_code):
    installed_languages = argostranslate.translate.get_installed_languages()
    from_lang = next((lang for lang in installed_languages if lang.code == from_code), None)
    to_lang = next((lang for lang in installed_languages if lang.code == to_code), None)
    if not from_lang or not to_lang:
        logger.error("Translation languages not installed.")
        return "Translation error"
    translation = from_lang.get_translation(to_lang)
    return translation.translate(text)

def answer_question(question, input_language, output_language, db):
    try:
        input_lang_code = LANGUAGE_ISO_CODES[input_language]
        output_lang_code = LANGUAGE_ISO_CODES[output_language]
        question_in_english = translate_text(question, input_lang_code, "en") if input_language != "English" else question
        # Simplified retrieval and response logic for demonstration; see the
        # build_qa_chain sketch below for one way to wire up the real RetrievalQA flow.
        response = "This is a simulated response based on the question."
        result_in_target_language = translate_text(response, "en", output_lang_code) if output_language != "English" else response
        return result_in_target_language
    except Exception as e:
        logger.error(f"Error in answer_question: {str(e)}")
        return "An error occurred while processing your question."

def document_analysis_and_feedback(document_path, feedback):
    # Placeholder for document analysis logic
    response = "Document analysis and feedback processing is not fully implemented."
    return response

def setup_gradio_interface(db):
    with gr.Blocks() as demo:
        gr.Markdown("# Multilingual Health and Wellness Chatbot")
        with gr.Tab("Ask a Question"):
            with gr.Row():
                input_language = gr.Dropdown(list(LANGUAGE_ISO_CODES.keys()), label="Input Language")
                output_language = gr.Dropdown(list(LANGUAGE_ISO_CODES.keys()), label="Output Language")
            question = gr.Textbox(label="Your question")
            answer = gr.Textbox(label="Answer")
            question.submit(lambda q, i, o: answer_question(q, i, o, db), inputs=[question, input_language, output_language], outputs=answer)
        
        with gr.Tab("Upload Document"):
            with gr.Row():
                document = gr.File(label="Upload your health document")
                feedback_box = gr.Textbox(label="Feedback (optional)")
            upload_response = gr.Textbox(label="Analysis Result")
            # gr.File has no submit event; run the analysis when a file is uploaded.
            document.upload(document_analysis_and_feedback, inputs=[document, feedback_box], outputs=upload_response)

    return demo

def main():
    INPUT_FILE_PATH = "sample-text.txt"  # Placeholder file path
    texts, embeddings = initialize_documents_and_embeddings(INPUT_FILE_PATH)
    db = initialize_database(texts, embeddings)
    demo = setup_gradio_interface(db)
    demo.launch()

if __name__ == "__main__":
    main()