VDNT11 committed on
Commit 72a8f7d · verified · 1 Parent(s): d7a1308

Upload 5 files

Files changed (4)
  1. Dockerfile +30 -0
  2. app.py +93 -0
  3. index.html +180 -0
  4. requirements.txt +8 -0
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM python:3.9-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     git \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first to leverage Docker cache
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Clone and install IndicTransToolkit
+ RUN git clone https://github.com/VarunGumma/IndicTransToolkit \
+     && cd IndicTransToolkit \
+     && pip install --editable ./
+
+ # Create necessary directories
+ RUN mkdir -p templates
+
+ # Expose the port the app runs on
+ EXPOSE 7860
+
+ # Command to run the application
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,93 @@
+ from flask import Flask, request, render_template, send_from_directory
+ from PIL import Image
+ import torch
+ from transformers import BlipProcessor, BlipForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer
+ from gtts import gTTS
+ import os
+ import soundfile as sf
+ from transformers import VitsTokenizer, VitsModel, set_seed
+ from IndicTransToolkit import IndicProcessor
+
+ # Initialize Flask app
+ app = Flask(__name__)
+ UPLOAD_FOLDER = "./static/uploads/"
+ AUDIO_FOLDER = "./static/audio/"
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+ os.makedirs(AUDIO_FOLDER, exist_ok=True)
+ app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER
+ app.config["AUDIO_FOLDER"] = AUDIO_FOLDER
+
+ # Load models
+ blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda" if torch.cuda.is_available() else "cpu")
+ model_name = "ai4bharat/indictrans2-en-indic-1B"
+ tokenizer_IT2 = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ model_IT2 = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
+ model_IT2 = torch.quantization.quantize_dynamic(
+     model_IT2, {torch.nn.Linear}, dtype=torch.qint8
+ )
+ model_IT2.to("cuda" if torch.cuda.is_available() else "cpu")
+ ip = IndicProcessor(inference=True)
+
+ # Functions
+ def generate_caption(image_path):
+     image = Image.open(image_path).convert("RGB")
+     inputs = blip_processor(image, "image of", return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
+     with torch.no_grad():
+         generated_ids = blip_model.generate(**inputs)
+     return blip_processor.decode(generated_ids[0], skip_special_tokens=True)
+
+ def translate_caption(caption, target_languages):
+     src_lang = "eng_Latn"
+     input_sentences = [caption]
+     translations = {}
+
+     for tgt_lang in target_languages:
+         batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
+         inputs = tokenizer_IT2(batch, truncation=True, padding="longest", return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
+         with torch.no_grad():
+             generated_tokens = model_IT2.generate(
+                 **inputs, min_length=0, max_length=256, num_beams=5, num_return_sequences=1
+             )
+         with tokenizer_IT2.as_target_tokenizer():
+             translated_tokens = tokenizer_IT2.batch_decode(generated_tokens.detach().cpu().tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         translations[tgt_lang] = ip.postprocess_batch(translated_tokens, lang=tgt_lang)[0]
+     return translations
+
+ def generate_audio_gtts(text, lang_code, output_file):
+     tts = gTTS(text=text, lang=lang_code)
+     tts.save(output_file)
+     return output_file
+
+ @app.route("/", methods=["GET", "POST"])
+ def index():
+     if request.method == "POST":
+         image_file = request.files.get("image")
+         if image_file:
+             image_path = os.path.join(app.config["UPLOAD_FOLDER"], image_file.filename)
+             image_file.save(image_path)
+
+             caption = generate_caption(image_path)
+             target_languages = request.form.getlist("languages")
+             translations = translate_caption(caption, target_languages)
+
+             audio_files = {}
+             lang_codes = {
+                 "hin_Deva": "hi", "guj_Gujr": "gu", "urd_Arab": "ur", "mar_Deva": "mr"
+             }
+             for lang, translation in translations.items():
+                 lang_code = lang_codes.get(lang, "en")
+                 audio_file_path = os.path.join(app.config["AUDIO_FOLDER"], f"{lang}.mp3")
+                 audio_files[lang] = generate_audio_gtts(translation, lang_code, audio_file_path)
+
+             return render_template(
+                 "index.html", image_path=image_path, caption=caption, translations=translations, audio_files=audio_files
+             )
+     return render_template("index.html")
+
+ @app.route("/audio/<filename>")
+ def audio(filename):
+     return send_from_directory(app.config["AUDIO_FOLDER"], filename)
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=7860)
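For reference, a minimal sketch of exercising the "/" route above from a separate process, assuming the app is already running locally on port 7860 and that the requests package is available (it is not pinned in requirements.txt); the sample image path is a placeholder, not a file shipped with this commit.

# test_client.py -- a minimal sketch, assuming `python app.py` is already running
import requests

URL = "http://localhost:7860/"

with open("sample.jpg", "rb") as f:  # hypothetical test image
    files = {"image": ("sample.jpg", f, "image/jpeg")}
    # "languages" matches request.form.getlist("languages") in the index() route.
    data = [("languages", "hin_Deva"), ("languages", "mar_Deva")]
    response = requests.post(URL, files=files, data=data)

print(response.status_code)
print(response.text[:500])  # rendered index.html with caption and translations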
index.html ADDED
@@ -0,0 +1,180 @@
+ <!-- <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Multilingual Assistive Model</title>
+ </head>
+ <body>
+     <h1>Multilingual Assistive Model</h1>
+     <form action="/" method="POST" enctype="multipart/form-data">
+         <label for="image">Upload an Image:</label>
+         <input type="file" id="image" name="image" required>
+         <br><br>
+         <label>Select Target Languages:</label><br>
+         <input type="checkbox" name="languages" value="hin_Deva"> Hindi<br>
+         <input type="checkbox" name="languages" value="mar_Deva"> Marathi<br>
+         <input type="checkbox" name="languages" value="guj_Gujr"> Gujarati<br>
+         <input type="checkbox" name="languages" value="urd_Arab"> Urdu<br>
+         <button type="submit">Submit</button>
+     </form>
+
+     {% if caption %}
+         <h2>Uploaded Image</h2>
+         <img src="{{ image_path }}" alt="Uploaded Image" style="max-width: 300px;">
+         <h2>Caption: {{ caption }}</h2>
+         <h2>Translations:</h2>
+         <ul>
+             {% for lang, translation in translations.items() %}
+                 <li>{{ lang }}: {{ translation }}
+                     <audio controls>
+                         <source src="/audio/{{ lang }}.mp3" type="audio/mpeg">
+                     </audio>
+                 </li>
+             {% endfor %}
+         </ul>
+     {% endif %}
+ </body>
+ </html> -->
+
+
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>Multilingual Assistive Model</title>
+     <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
+ </head>
+ <body class="bg-gray-100 p-8">
+     <div class="max-w-4xl mx-auto">
+         <h1 class="text-3xl font-bold mb-8">Multilingual Assistive Model</h1>
+
+         <div class="bg-white p-6 rounded-lg shadow-lg">
+             <form id="uploadForm" class="space-y-4">
+                 <div>
+                     <label class="block text-sm font-medium mb-2">Upload Image</label>
+                     <input type="file" name="image" accept="image/*" required
+                            class="w-full p-2 border rounded">
+                 </div>
+
+                 <div>
+                     <label class="block text-sm font-medium mb-2">Select Languages</label>
+                     <div class="space-y-2">
+                         <label class="inline-flex items-center">
+                             <input type="checkbox" name="languages[]" value="hin_Deva" checked
+                                    class="form-checkbox">
+                             <span class="ml-2">Hindi</span>
+                         </label>
+                         <br>
+                         <label class="inline-flex items-center">
+                             <input type="checkbox" name="languages[]" value="mar_Deva" checked
+                                    class="form-checkbox">
+                             <span class="ml-2">Marathi</span>
+                         </label>
+                         <br>
+                         <label class="inline-flex items-center">
+                             <input type="checkbox" name="languages[]" value="guj_Gujr"
+                                    class="form-checkbox">
+                             <span class="ml-2">Gujarati</span>
+                         </label>
+                         <br>
+                         <label class="inline-flex items-center">
+                             <input type="checkbox" name="languages[]" value="urd_Arab"
+                                    class="form-checkbox">
+                             <span class="ml-2">Urdu</span>
+                         </label>
+                     </div>
+                 </div>
+
+                 <button type="submit"
+                         class="w-full bg-blue-500 text-white py-2 px-4 rounded hover:bg-blue-600">
+                     Process Image
+                 </button>
+             </form>
+
+             <div id="results" class="mt-8 hidden">
+                 <div id="imagePreview" class="mb-4"></div>
+
+                 <div class="space-y-4">
+                     <div>
+                         <h2 class="text-xl font-semibold mb-2">Caption:</h2>
+                         <p id="caption" class="text-gray-700"></p>
+                     </div>
+
+                     <div>
+                         <h2 class="text-xl font-semibold mb-2">Translations:</h2>
+                         <div id="translations" class="space-y-2"></div>
+                     </div>
+                 </div>
+             </div>
+
+             <div id="loading" class="hidden mt-4">
+                 <p class="text-center text-gray-600">Processing... Please wait.</p>
+             </div>
+         </div>
+     </div>
+
+     <script>
+         document.getElementById('uploadForm').addEventListener('submit', async (e) => {
+             e.preventDefault();
+
+             const form = e.target;
+             const formData = new FormData(form);
+
+             // Show loading
+             document.getElementById('loading').classList.remove('hidden');
+             document.getElementById('results').classList.add('hidden');
+
+             try {
+                 const response = await fetch('/process', {
+                     method: 'POST',
+                     body: formData
+                 });
+
+                 const data = await response.json();
+
+                 // Display results
+                 document.getElementById('caption').textContent = data.caption;
+
+                 const translationsDiv = document.getElementById('translations');
+                 translationsDiv.innerHTML = '';
+
+                 for (const [lang, translation] of Object.entries(data.translations)) {
+                     const div = document.createElement('div');
+                     div.className = 'mb-4';
+                     div.innerHTML = `
+                         <h3 class="font-medium">${lang}:</h3>
+                         <p class="text-gray-700">${translation}</p>
+                         <audio controls src="/audio/${lang}" class="mt-2"></audio>
+                     `;
+                     translationsDiv.appendChild(div);
+                 }
+
+                 // Show results
+                 document.getElementById('results').classList.remove('hidden');
+             } catch (error) {
+                 console.error('Error:', error);
+                 alert('An error occurred while processing the image.');
+             } finally {
+                 document.getElementById('loading').classList.add('hidden');
+             }
+         });
+
+         // Image preview
+         document.querySelector('input[type="file"]').addEventListener('change', (e) => {
+             const file = e.target.files[0];
+             if (file) {
+                 const reader = new FileReader();
+                 reader.onload = (e) => {
+                     const img = document.createElement('img');
+                     img.src = e.target.result;
+                     img.className = 'max-w-full h-auto rounded';
+                     const previewDiv = document.getElementById('imagePreview');
+                     previewDiv.innerHTML = '';
+                     previewDiv.appendChild(img);
+                 };
+                 reader.readAsDataURL(file);
+             }
+         });
+     </script>
+ </body>
+ </html>
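Note that the script in this template posts the form to /process and expects a JSON response with "caption" and "translations" keys, while app.py in this commit only defines the "/" and "/audio/<filename>" routes. A minimal sketch of what such a JSON route might look like, reusing the helpers and config already defined in app.py; this is a hypothetical addition, not part of this commit.

# Hypothetical /process route for app.py (not in this commit); assumes the
# existing app, request, os, generate_caption, translate_caption and
# generate_audio_gtts names from app.py are in scope.
from flask import jsonify

@app.route("/process", methods=["POST"])
def process():
    image_file = request.files.get("image")
    if not image_file:
        return jsonify({"error": "no image uploaded"}), 400
    image_path = os.path.join(app.config["UPLOAD_FOLDER"], image_file.filename)
    image_file.save(image_path)

    caption = generate_caption(image_path)
    # The checkboxes in this template are named "languages[]".
    target_languages = request.form.getlist("languages[]")
    translations = translate_caption(caption, target_languages)

    lang_codes = {"hin_Deva": "hi", "guj_Gujr": "gu", "urd_Arab": "ur", "mar_Deva": "mr"}
    for lang, translation in translations.items():
        audio_path = os.path.join(app.config["AUDIO_FOLDER"], f"{lang}.mp3")
        generate_audio_gtts(translation, lang_codes.get(lang, "en"), audio_path)

    return jsonify({"caption": caption, "translations": translations})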
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ flask==2.0.1
+ Pillow==9.3.0
+ torch==2.0.0
+ transformers==4.28.0
+ gTTS==2.3.1
+ soundfile==0.12.1
+ numpy==1.24.2
+ scipy==1.10.1