File size: 7,069 Bytes
17c6318
30ffb9e
 
 
 
 
 
 
 
 
 
17c6318
 
30ffb9e
 
 
17c6318
 
30ffb9e
 
17c6318
30ffb9e
 
17c6318
30ffb9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17c6318
 
30ffb9e
 
17c6318
30ffb9e
 
 
fc26027
 
30ffb9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17c6318
30ffb9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc26027
30ffb9e
 
 
17c6318
30ffb9e
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
from modal import App, Volume, Image, Mount

from typing import List, Dict, Tuple, Union, Callable
# from preprocessing import FileIO

# assets = modal.Mount.from_local_dir(
#     "./data",
#     # condition=lambda pth: not ".venv" in pth,
#     remote_path="./data",
# )

# Modal application object; the functions below register themselves on it.
app = App("vector-search-project")

# Container image shared by the remote functions: Debian slim plus the pinned
# embedding / finetuning libraries.
vector_search = Image.debian_slim().pip_install(
    "sentence_transformers==2.2.2",
    "llama_index==0.9.6.post1",
    "angle_emb==0.1.5",
)

# Named volume used to hold models; the functions below attach it at /root/models.
# The volume must be created manually beforehand with the CLI:
#   modal volume create vector-search-volume
vol = Volume.from_name("vector-search-volume")


@app.function(image=vector_search,
               gpu="A100",
               timeout=600,
               volumes={"/root/models": vol}
               # secrets are available in the environment with os.environ["SECRET_NAME"]
               # secret=modal.Secret.from_name("my-huggingface-secret")
               )
def encode_content_splits(content_splits,
                            model=None,  # path or name of model
                            **kwargs
                            ):
    """Encode text chunks remotely and pair every chunk with its embedding.

    Args:
        content_splits: List of episodes, each episode being a list of text
            chunks. A flat list of strings is also accepted; each string is
            then handled as a one-chunk episode.
        model: Either a model name / path (str) or an already instantiated
            model object exposing ``encode``. For a string, only the last
            path component is looked up in ``/root/models``; when it is not
            found there the model is loaded through ``SentenceTransformer``.
        **kwargs: Extra keyword arguments forwarded to ``model.encode``.
            When a model object is passed, a ``save`` entry is consumed here
            and not forwarded.

    Returns:
        A list with one entry per episode; each entry is a list of
        ``(chunk, embedding)`` tuples. For the finetuned UAE-Large-V1-300
        model the episodes are flattened first, so the result holds one
        single-pair entry per chunk.

    Raises:
        ValueError: If ``model`` is None.
    """
    import os
    import time

    from sentence_transformers import SentenceTransformer

    models_dir = '/root/models'
    models_list = os.listdir(models_dir)
    print("Models:", models_list)

    if model is None:
        # Fail early with a clear message instead of an AttributeError on
        # ``None.encode`` further down.
        raise ValueError("encode_content_splits requires a model name, path or model object")

    if isinstance(model, str):
        # Drop a single trailing slash so the basename is never empty.
        requested = model[:-1] if model.endswith("/") else model
        model_name = requested.split('/')[-1]
        local_path = os.path.join(models_dir, model_name)

        if model_name in models_list:
            if "UAE-Large-V1-300" in model_name:
                print("Loading finetuned UAE-Large-V1-300 model from Modal Volume")

                from angle_emb import AnglE
                model = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1',
                                              pretrained_model_path=local_path,
                                              pooling_strategy='cls').cuda()
                kwargs['to_numpy'] = True

                # this model doesn't accept list of lists
                if content_splits and isinstance(content_splits[0], list):
                    content_splits = [chunk for episode in content_splits for chunk in episode]
            else:
                print(f"Loading model {model_name} from Modal volume")
                model = SentenceTransformer(local_path)
        else:
            print(f"Model {model_name} not found in Modal volume, loading from HuggingFace")
            # Keep the organisation prefix of a plain "org/name" identifier:
            # a bare name is resolved by sentence-transformers under its own
            # organisation, which breaks models published by anyone else.
            org, _, repo = requested.partition('/')
            is_hub_id = bool(org) and bool(repo) and '/' not in repo and not org.startswith('.')
            model = SentenceTransformer(requested if is_hub_id else model_name)

    else:
        print("Using model provided as argument")
        if 'save' in kwargs:
            # Remove 'save' so it is not passed on to model.encode().
            save_name = kwargs.pop('save')
            if isinstance(save_name, str) and save_name.endswith('/'):
                save_name = save_name[:-1]
            save_name = save_name.split('/')[-1]
            fname = os.path.join(models_dir, save_name)
            print(f"Saving model in {fname}")
            # NOTE: saving is currently disabled; the messages are kept as they were.
            # model.save(fname)
            print(f"Model saved in {fname}")

    print("Starting encoding")
    start = time.perf_counter()

    emb = []
    for episode in content_splits:
        # A bare string would otherwise be zipped character by character,
        # so wrap it into a one-chunk batch.
        chunks = [episode] if isinstance(episode, str) else episode
        emb.append(list(zip(chunks, model.encode(chunks, **kwargs))))

    elapsed = time.perf_counter() - start
    print(f"GPU processing lasted {elapsed:.2f} seconds")
    print("Encoding finished")

    return emb


@app.function(image=vector_search, gpu="A100", timeout=240,
               mounts=[Mount.from_local_dir("./data",
                                                  remote_path="/root/data",
                                                  condition=lambda pth: ".json" in pth)],
               volumes={"/root/models": vol}
)
def finetune(training_path='./data/training_data_300.json',
             valid_path='./data/validation_data_100.json',
             model_id=None,
             ignore_existing=False):
    """Finetune an embedding model and return the result as an in-memory zip.

    The finetuned model is written to
    ``/root/models/finetuned-<model name>-<number of training queries>``,
    the volume is committed, and the output folder is zipped.

    Args:
        training_path: JSON file loaded as the training
            ``EmbeddingQAFinetuneDataset``. The local ``./data`` JSON files
            are mounted at ``/root/data``; the relative default presumably
            relies on the working directory being ``/root`` - TODO confirm.
        valid_path: JSON file loaded as the validation dataset.
        model_id: Name or path of the base model. Nothing is done when None.
        ignore_existing: Currently unused.

    Returns:
        ``None`` when no ``model_id`` is given, an ``io.BytesIO`` positioned
        at 0 containing the zipped model folder on success, or the string
        ``"Finetuning failed"`` when finetuning, committing or zipping raises.
    """
    import glob
    import io
    import os
    import time
    import traceback
    import zipfile

    print("Data:", os.listdir('/root/data'))
    print("Models:", os.listdir('/root/models'))

    if model_id is None:
        print("No model ID provided")
        return None
    if isinstance(model_id, str) and model_id.endswith("/"):
        model_id = model_id[:-1]

    from llama_index.finetuning import EmbeddingQAFinetuneDataset

    training_set = EmbeddingQAFinetuneDataset.from_json(training_path)
    valid_set = EmbeddingQAFinetuneDataset.from_json(valid_path)
    print("Datasets loaded")

    num_training_examples = len(training_set.queries)
    print(f"Training examples: {num_training_examples}")

    from llama_index.finetuning import SentenceTransformersFinetuneEngine

    print(f"Model Name is {model_id}")
    # Last path component: identical to index 1 for "org/name" ids, and also
    # valid for ids without a slash or for longer paths.
    model_ext = model_id.split('/')[-1]

    ft_model_name = f'finetuned-{model_ext}-{num_training_examples}'
    model_outpath = os.path.join("/root/models", ft_model_name)

    print(f'Model ID: {model_id}')
    print(f'Model Outpath: {model_outpath}')

    finetune_engine = SentenceTransformersFinetuneEngine(
        training_set,
        batch_size=32,
        model_id=model_id,
        model_output_path=model_outpath,
        val_dataset=valid_set,
        epochs=10
    )
    try:
        start = time.perf_counter()
        finetune_engine.finetune()
        elapsed = time.perf_counter() - start
        print(f"GPU processing lasted {elapsed:.2f} seconds")

        print(os.listdir('/root/models'))
        # Persist changes, i.e. the finetuned model. The commit has to go
        # through the Volume object mounted at /root/models.
        vol.commit()

        # TODO SHARE THE MODEL ON HUGGINGFACE
        # https://huggingface.co/docs/transformers/v4.15.0/model_sharing

        # Zip the contents of the output folder into an in-memory buffer.
        bytes_buffer = io.BytesIO()
        with zipfile.ZipFile(bytes_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file_path in glob.glob(model_outpath + "/**", recursive=True):
                print(f"Processed file {file_path}")
                zip_file.write(file_path, os.path.relpath(file_path, start=model_outpath))

        # Move the pointer to the start of the buffer before returning it.
        bytes_buffer.seek(0)
        return bytes_buffer
    except Exception:
        # Keep the string return value for callers, but leave a traceback in
        # the logs so the failure can be diagnosed.
        traceback.print_exc()
        return "Finetuning failed"
    
    
@app.local_entrypoint()
def test_method(content_splits=[["a"]]):
    """Run ``encode_content_splits`` remotely on ``content_splits`` and return its result."""
    return encode_content_splits.remote(content_splits)
  
# deploy it with
# modal token set --token-id ak-xxxxxx --token-secret as-xxxxx # given when we create a new token
# modal deploy podcast/1/backend.py
# View Deployment: https://modal.com/apps/jpbianchi/falcon_hackaton-project <<< use this project name