Update app.py
app.py CHANGED
@@ -1,31 +1,29 @@
 import streamlit as st
 # To make things easier later, we're also importing numpy and pandas for
 # working with sample data.
-import numpy as np
-import pandas as pd
 import torch
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('moka-ai/m3e-base')
+
+#Our sentences we like to encode
+sentences = [
+    '* Moka 此文本嵌入模型由 MokaAI 训练并开源,训练脚本使用 uniem',
+    '* Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练',
+    '* Mixed 此文本嵌入模型支持中英双语的同质文本相似度计算,异质文本检索等功能,未来还会支持代码检索,ALL in one'
+]
+
+#Sentences are encoded by calling model.encode()
+embeddings = model.encode(sentences)
+
+#Print the embeddings
+for sentence, embedding in zip(sentences, embeddings):
+    print("Sentence:", sentence)
+    print("Embedding:", embedding)
+    print("")
+
+
 import faiss
-import numpy as np
-from transformers import AutoTokenizer, AutoModel
-
-# Load the embedding model and tokenizer
-model_name = "moka-ai/m3e-base"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModel.from_pretrained(model_name)
-
-# Generate some random text contents
-texts = ["This is the first document.", "This is the second document.", "And this is the third one.", "Is this the first document?"]
-
-# Convert the text contents to embeddings
-embeddings = []
-for text in texts:
-    input_ids = tokenizer.encode(text, return_tensors="pt")
-    with torch.no_grad():
-        embedding = model(input_ids)[0][0].numpy()
-    embeddings.append(embedding)
-embeddings = np.array(embeddings)
-
-# Create a Faiss index
 d = embeddings.shape[1] # Dimension of the embeddings
 index = faiss.IndexFlatIP(d) # Index that uses inner product (dot product) similarity
 
@@ -33,7 +31,7 @@ index = faiss.IndexFlatIP(d) # Index that uses inner product (dot product) simi
 index.add(embeddings)
 
 # Search for similar documents
-query = "
+query = "训练脚本."
 input_ids = tokenizer.encode(query, return_tensors="pt")
 with torch.no_grad():
     query_embedding = model(input_ids)[0][0].numpy()
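Note that the search block at the end of app.py still calls tokenizer.encode(query, ...) and model(input_ids), which belong to the transformers code removed by this commit, so the query step no longer runs as written. Below is a minimal sketch (not part of the commit) of how the retrieval step could be completed with the SentenceTransformer model and the Faiss index; the L2 normalization and the choice of k=2 are illustrative assumptions, not something the committed code does.

import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('moka-ai/m3e-base')

# Same demo sentences as in app.py.
sentences = [
    '* Moka 此文本嵌入模型由 MokaAI 训练并开源,训练脚本使用 uniem',
    '* Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练',
    '* Mixed 此文本嵌入模型支持中英双语的同质文本相似度计算,异质文本检索等功能,未来还会支持代码检索,ALL in one'
]

# encode() returns a float32 NumPy array of shape (n_sentences, embedding_dim).
embeddings = model.encode(sentences)

# Assumption: normalizing the rows makes the inner-product index score by cosine
# similarity; the committed code indexes the raw embeddings instead.
faiss.normalize_L2(embeddings)

index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Encode the query with the same model instead of the removed tokenizer/model pair.
query = "训练脚本."
query_embedding = model.encode([query])
faiss.normalize_L2(query_embedding)

# Retrieve the two most similar sentences (k=2 is an arbitrary choice here).
scores, ids = index.search(query_embedding, 2)
for score, idx in zip(scores[0], ids[0]):
    print(f"{score:.3f}  {sentences[idx]}")

With IndexFlatIP the score is a plain dot product, so normalizing both the stored vectors and the query is what turns it into cosine similarity; skipping the normalization keeps the commit's behaviour but makes scores depend on embedding magnitude.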