File size: 2,233 Bytes
01b8e8e
 
5692cb3
 
 
01b8e8e
 
 
 
 
39503cb
01b8e8e
39503cb
01b8e8e
 
 
 
 
 
 
39503cb
27e0350
01b8e8e
27e0350
6a6afbf
27e0350
 
01b8e8e
 
 
39503cb
01b8e8e
 
 
 
 
101be32
01b8e8e
101be32
 
5692cb3
 
 
 
 
 
6bb1fd5
5692cb3
6bb1fd5
 
cfc1673
6bb1fd5
 
42468fb
5692cb3
 
 
42468fb
101be32
39503cb
 
 
101be32
39503cb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import uuid

from haystack.document_stores import BaseDocumentStore
from haystack.schema import Document


def format_docs(documents):
    """Convert raw document items into ``Document`` objects.

    Every item must provide a ``"text"`` entry, which becomes the document
    content. The item's ``"id"`` entry is used as the external document id;
    when that entry is missing or ``None`` a random UUID4 string is generated
    instead. The external id is stored under ``meta["id"]``, while the
    ``Document`` itself always receives a fresh UUID4 string as its own ``id``.

    Args:
        documents: Iterable of subscriptable items (e.g. dicts) with a
            ``"text"`` entry and, optionally, an ``"id"`` entry.

    Returns:
        A ``(docs, doc_ids)`` tuple: the created ``Document`` objects and
        their ``meta["id"]`` values, in input order.

    Raises:
        KeyError: If a dict item has no ``"text"`` entry.
    """
    db_docs: list = []
    for doc in documents:
        # A missing "id" key is treated exactly like an explicit None.
        try:
            doc_id = doc["id"]
        except KeyError:
            doc_id = None
        if doc_id is None:
            doc_id = str(uuid.uuid4())

        db_docs.append(
            Document(
                content=doc["text"],
                content_type="text",
                # The Document's own id is always new; the caller-facing id
                # lives in meta so it survives alongside the stored document.
                id=str(uuid.uuid4()),
                meta={"id": doc_id},
            )
        )
    return db_docs, [db_doc.meta["id"] for db_doc in db_docs]


def index(documents, pipeline, clear_index=True):
    """Format *documents* and run them through *pipeline*.

    Args:
        documents: Raw documents, passed to ``format_docs`` for conversion.
        pipeline: Object exposing ``get_nodes_by_class`` and ``run``.
        clear_index: When truthy, ``delete_index`` is called on every
            ``BaseDocumentStore`` node of the pipeline (with that store's own
            ``index`` attribute) before the pipeline is run.

    Returns:
        The document ids produced by ``format_docs``.
    """
    formatted_docs, ids = format_docs(documents)

    # With clearing disabled there is nothing to reset, so the loop is a no-op.
    stores_to_reset = (
        pipeline.get_nodes_by_class(class_type=BaseDocumentStore)
        if clear_index
        else []
    )
    for store in stores_to_reset:
        store.delete_index(store.index)

    pipeline.run(documents=formatted_docs)
    return ids


def search(queries, pipeline):
    """Run *queries* through *pipeline* in one batch and return dict matches.

    Args:
        queries: Queries forwarded unchanged to ``pipeline.run_batch``.
        pipeline: Object exposing ``run_batch(queries=...)`` whose result has
            a ``"documents"`` entry holding one sequence of matches per query.

    Returns:
        One list per query. Each match is a dict with ``"text"``, ``"id"``
        (the match's ``meta["id"]``), ``"fragment_id"`` (the match's own id)
        and ``"meta"``. Matches whose ``content_type`` is ``"audio"``
        additionally carry ``"content_audio"``. When every match of a query
        has a score, each dict also carries ``"score"`` and the list is sorted
        by it in descending order. If any match of a query has no score,
        ``"score"`` is omitted for that whole query and the order returned by
        the pipeline is kept.
    """
    results = []
    batch_output = pipeline.run_batch(queries=queries)
    for matches in batch_output["documents"]:
        matches = list(matches)
        # Decide score availability once per query, before building any match,
        # so the output does not depend on where a score-less match appears.
        has_scores = all(res.score is not None for res in matches)

        query_results = []
        for res in matches:
            meta = res.meta
            # Prefer the original text kept in meta over the stored content.
            if "content_text" in meta:
                original_text = meta["content_text"]
            else:
                original_text = res.content

            match = {
                "text": original_text,
                "id": meta["id"],
                "fragment_id": res.id,
                "meta": meta,
            }
            if has_scores:
                match["score"] = res.score
            if res.content_type == "audio":
                # Add audio path from the content field
                match["content_audio"] = res.content
            query_results.append(match)

        if has_scores:
            query_results = sorted(
                query_results, key=lambda x: x["score"], reverse=True
            )
        results.append(query_results)
    return results