maykcaldas commited on
Commit
f274d93
1 Parent(s): 5216067

First commit

Browse files
Files changed (4) hide show
  1. agent.py +68 -0
  2. app.py +52 -0
  3. mapi_tools.py +215 -0
  4. utils.py +96 -0
agent.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mapi_tools import MAPI_class_tools, MAPI_reg_tools
2
+ from utils import common_tools
3
+ from langchain import OpenAI
4
+ from gpt_index import GPTListIndex, GPTIndexMemory
5
+ from langchain import agents
6
+ from langchain.agents import initialize_agent
7
+
8
# --- Materials Project property toolkits ------------------------------------
# Classification toolkits take: API field, readable name, positive label,
# negative label.
stability = MAPI_class_tools("is_stable", "stable", "Stable", "Unstable")
magnetism = MAPI_class_tools("is_magnetic", "magnetic", "Magnetic", "Not magnetic")
metal = MAPI_class_tools("is_metal", "metallic", "Metal", "Not metal")
gap_direct = MAPI_class_tools("is_gap_direct", "gap direct", "Gap direct", "Gap indirect")

# Regression toolkits take: API field, readable name.  The readable name is
# interpolated into tool names and prompts, so it must describe the property
# itself (no trailing "gap" copied over from the band-gap entry).
band_gap = MAPI_reg_tools("band_gap", "band gap")
energy_per_atom = MAPI_reg_tools("energy_per_atom", "energy per atom")
formation_energy_per_atom = MAPI_reg_tools(
    "formation_energy_per_atom", "formation energy per atom"
)
volume = MAPI_reg_tools("volume", "volume")
density = MAPI_reg_tools("density", "density")
atomic_density = MAPI_reg_tools("density_atomic", "atomic density")
electronic_energy = MAPI_reg_tools("e_electronic", "electronic energy")
# NOTE(review): the variable says "ionic" but the readable name says
# "cationic" -- confirm which wording is intended for the `e_ion` field.
ionic_energy = MAPI_reg_tools("e_ion", "cationic energy")
total_energy = MAPI_reg_tools("e_total", "total energy")


# --- Agent assembly ---------------------------------------------------------
memory = GPTIndexMemory(
    index=GPTListIndex([]),
    memory_key="chat_history",
    query_kwargs={"response_mode": "compact"},
)
llm = OpenAI(temperature=0.7)

# Order matters only in that it is the order the tools are handed to the
# agent; it is kept identical to the original concatenation.
_property_toolkits = (
    stability,
    magnetism,
    gap_direct,
    metal,
    band_gap,
    volume,
    density,
    atomic_density,
    formation_energy_per_atom,
    energy_per_atom,
    electronic_energy,
    ionic_energy,
    total_energy,
)
tools = [
    property_tool
    for toolkit in _property_toolkits
    for property_tool in toolkit.get_tools()
]
tools = tools + agents.load_tools(["llm-math", "python_repl"], llm=llm) + common_tools

agent_chain = initialize_agent(
    tools,
    llm,
    agent="zero-shot-react-description",
    verbose=True,
    memory=memory,
)
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import agent
4
+ import os
5
+
6
# Custom stylesheet passed to gr.Blocks(css=...): sets the font used by the
# Gradio container.
css_style = """
.gradio-container {
    font-family: "IBM Plex Mono";
}
"""
11
+
12
def agent_run(q, openai_api_key, mapi_api_key):
    """Run the agent chain on question ``q`` using the supplied API keys.

    Both keys are exported through ``os.environ`` (``OPENAI_API_KEY`` and
    ``MAPI_API_KEY``) before the chain is invoked.

    Returns:
        The chain's output, or a generic error message if the run raised.
    """
    import logging  # local import so the file's import block stays untouched

    os.environ["OPENAI_API_KEY"] = openai_api_key
    os.environ["MAPI_API_KEY"] = mapi_api_key
    try:
        return agent.agent_chain.run(input=q)
    except Exception:
        # `except Exception` instead of a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate; the traceback is logged instead of lost.
        logging.getLogger(__name__).exception("Agent run failed")
        return "Something went wrong, please try again"
20
+
21
# User-facing Markdown is kept in flush-left module constants so that no
# source indentation ends up inside the rendered text.
_INTRO_MARKDOWN = """
# An LLM application developed during the LLM March *MADNESS* Hackathon
- Developed by: Mayk Caldas ([@maykcaldas](https://github.com/maykcaldas)) and Sam Cox ([@SamCox822](https://github.com/SamCox822))

## What is this?
- This is a demo of an LLM agent that can answer questions about materials science using [LangChain🦜️🔗](https://github.com/hwchase17/langchain/) and the [Materials Project API](https://materialsproject.org/).
- Its behavior is based on Large Language Models (LLMs), and it aims to be a tool that helps scientists make quick predictions of numerous properties of materials.
It is a work in progress, so please be patient with it.


### Some keys are needed in order to use it:
1. An OpenAI API key ( [Check it here](https://platform.openai.com/account/api-keys) )
2. A Materials Project API key ( [Check it here](https://materialsproject.org/api#api-key) )
"""

_PROPERTIES_MARKDOWN = """
Classification tasks: Stability, magnetism, gap_direct, metal,
regression tasks: band_gap, volume, density, atomic_density, formation energy per atom, energy per atom, electronic energy, ionic energy, total energy
"""

with gr.Blocks(css=css_style) as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    with gr.Accordion("List of properties we developed tools for", open=False):
        gr.Markdown(_PROPERTIES_MARKDOWN)

    # Both keys are masked in the UI and forwarded to agent_run on each query.
    openai_api_key = gr.Textbox(
        label="OpenAI API Key", placeholder="sk-...", type="password")
    mapi_api_key = gr.Textbox(
        label="Materials Project API Key", placeholder="...", type="password")

    with gr.Tab("MAPI Query"):
        text_input = gr.Textbox(label="", placeholder="Enter question here...")
        text_output = gr.Textbox()
        text_button = gr.Button("Query!")

    # Event wiring: question + both keys go in, the agent's answer comes out.
    text_button.click(
        agent_run,
        inputs=[text_input, openai_api_key, mapi_api_key],
        outputs=text_output,
    )

demo.launch()
mapi_tools.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mp_api.client import MPRester
2
+ from emmet.core.summary import HasProps
3
+ import openai
4
+ import langchain
5
+ from langchain import OpenAI
6
+ from langchain import agents
7
+ from langchain.agents import initialize_agent
8
+ from langchain.agents import Tool, tool
9
+ from langchain import LLMMathChain, SerpAPIWrapper
10
+ from gpt_index import GPTListIndex, GPTIndexMemory
11
+ from langchain import SerpAPIWrapper
12
+ from langchain.prompts.few_shot import FewShotPromptTemplate
13
+ from langchain.prompts.prompt import PromptTemplate
14
+ from langchain.vectorstores import FAISS, Chroma
15
+ from langchain.embeddings import OpenAIEmbeddings
16
+ from langchain.prompts.example_selector import (MaxMarginalRelevanceExampleSelector,
17
+ SemanticSimilarityExampleSelector)
18
+ import requests
19
+ from rdkit import Chem
20
+ import pandas as pd
21
+ import os
22
+
23
class MAPITools:
    """Base class bundling LangChain tools around one Materials Project property.

    Subclasses are expected to set ``self.prop`` (the field requested from the
    API) and ``self.prop_name`` (the readable name interpolated into tool
    names and descriptions), and to implement ``check_prop_by_formula`` and
    ``create_context_prompt``.
    """

    def __init__(self):
        self.model = 'text-ada-001'  # maybe change to gpt-4 when ready
        self.k = 10  # passed as `k` to the example selector by subclasses

    def get_material_atoms(self, formula):
        '''Receives a material formula and returns the atoms symbols present in it separated by comma.

        Each symbol is listed once, in order of first appearance
        (e.g. "CH3COOH" -> "C,H,O").
        '''
        import re

        # A symbol is one uppercase letter followed by lowercase letters; the
        # stoichiometric counts are irrelevant here and are simply skipped.
        symbols = re.findall(r"[A-Z][a-z]*", formula)
        # dict.fromkeys drops repeated symbols while preserving order.
        return ",".join(dict.fromkeys(symbols))

    def check_prop_by_formula(self, formula):
        raise NotImplementedError('Should be implemented in children classes')

    def search_similars_by_atom(self, atoms):
        '''This function receives a string with the atoms separated by comma as input and returns a list of similar materials'''
        atoms = atoms.replace(" ", "")
        with MPRester(os.getenv("MAPI_API_KEY")) as mpr:
            docs = mpr.summary.search(
                elements=atoms.split(','),
                fields=["formula_pretty", self.prop],
            )
        return docs

    def create_context_prompt(self, formula):
        raise NotImplementedError('Should be implemented in children classes')

    def LLM_predict(self, prompt):
        ''' This function receives a prompt generate with context by the create_context_prompt tool and request a completion to a language model. Then returns the completion'''
        llm = OpenAI(
            model_name=self.model,
            temperature=0.7,
            n=1,
            best_of=5,
            top_p=1.0,
            stop=["\n\n", "###", "#", "##"],
        )
        # Only the text of the first generation for the single prompt is used.
        return llm.generate([prompt]).generations[0][0].text

    def get_tools(self):
        """Return the list of LangChain ``Tool`` objects for this property."""
        # The context-tool description depends on whether this toolkit is a
        # classification or a regression one.
        if isinstance(self, MAPI_class_tools):
            context_description = (
                f"This function received a material formula as input and create a prompt to be inputed in the LLM_predict tool to predict if the material is {self.prop_name}."
            )
        else:
            context_description = (
                f"This function received a material formula as input and create a prompt to be inputed in the LLM_predict tool to predict the {self.prop_name} of a material."
            )

        return [
            Tool(
                name="Get atoms in material",
                func=self.get_material_atoms,
                description=(
                    "Receives a material formula and returns the atoms symbols present in it separated by comma."
                ),
            ),
            Tool(
                name=f"Checks if material is {self.prop_name} by formula",
                func=self.check_prop_by_formula,
                description=(
                    f"This functions searches in the material project's API for the formula and returns if it is {self.prop_name} or not."
                ),
            ),
            # Disabled: exposing the raw similarity search as its own tool.
            # Tool(
            #     name = "Search similar materials by atom",
            #     func = self.search_similars_by_atom,
            #     description = (
            #         "This function receives a string with the atoms separated by comma as input and returns a list of similar materials."
            #     )
            # ),
            Tool(
                name=f"Create {self.prop_name} context to LLM search",
                func=self.create_context_prompt,
                description=context_description,
            ),
            Tool(
                # The agent has to reproduce tool names verbatim, so the name
                # is spelled correctly ("prediction", not "predictiom").
                name="LLM prediction",
                func=self.LLM_predict,
                description=(
                    "This function receives a prompt generate with context by the create_context_prompt tool and request a completion to a language model. Then returns the completion"
                ),
            ),
        ]
105
+
106
class MAPI_class_tools(MAPITools):
    """Toolkit for a boolean (classification) Materials Project property.

    Args:
        prop: field name requested from the API and read from each document.
        prop_name: readable name interpolated into tool names and prompts.
        p_label: text returned when the property value is truthy.
        n_label: text returned when the property value is falsy.
    """

    def __init__(self, prop, prop_name, p_label, n_label):
        super().__init__()
        self.prop = prop
        self.prop_name = prop_name
        self.p_label = p_label
        self.n_label = n_label

    def check_prop_by_formula(self, formula):
        '''Search the Materials Project API for the formula and return the positive or negative label.

        Only the first returned document is inspected, and only if its
        ``formula_pretty`` equals ``formula``; otherwise a "could not find"
        message is returned.
        '''
        # NOTE: this used to be an f-string, which Python does not treat as a
        # docstring; a plain string literal is used instead.
        with MPRester(os.getenv("MAPI_API_KEY")) as mpr:
            docs = mpr.summary.search(formula=formula, fields=["formula_pretty", self.prop])
        if docs and docs[0].formula_pretty == formula:
            return self.p_label if docs[0].dict()[self.prop] else self.n_label
        return f"Could not find any material while searching {formula}"

    def create_context_prompt(self, formula):
        '''Build a few-shot prompt asking whether ``formula`` has this property.

        Materials sharing elements with ``formula`` are fetched, labelled with
        ``p_label``/``n_label`` and offered to an example selector that picks
        ``self.k`` of them for the prompt.
        '''
        elements = self.get_material_atoms(formula)
        similars = self.search_similars_by_atom(elements)
        labelled = [
            {
                'formula': ex.formula_pretty,
                'prop': self.p_label if ex.dict()[self.prop] else self.n_label,
            }
            for ex in similars
        ]
        examples = pd.DataFrame(labelled).drop_duplicates().to_dict(orient="records")
        if not examples:
            # Without examples there is nothing to index for the selector;
            # report that instead of failing inside the vector store.
            return f"Could not find any similar material to build a context for {formula}"

        example_selector = MaxMarginalRelevanceExampleSelector.from_examples(
            examples,
            OpenAIEmbeddings(),
            FAISS,
            k=self.k,
        )

        prefix = (
            f'You are a bot who can predict if a material is {self.prop_name}.\n'
            f'Given this list of known materials and the information if they are {self.p_label} or {self.n_label}, \n'
            f'you need to answer the question if the last material is {self.prop_name}:'
        )
        # "@@@" separates question from answer and "###" ends each example.
        prompt_template = PromptTemplate(
            input_variables=["formula", "prop"],
            template=f"Is {{formula}} a {self.prop_name} material?@@@\n{{prop}}###",
        )
        suffix = f"Is {{formula}} a {self.prop_name} material?@@@\n"
        prompt = FewShotPromptTemplate(
            example_prompt=prompt_template,
            example_selector=example_selector,
            prefix=prefix,
            suffix=suffix,
            input_variables=["formula"],
        )

        return prompt.format(formula=formula)
159
+
160
class MAPI_reg_tools(MAPITools):
    """Toolkit for a numeric (regression) Materials Project property.

    Args:
        prop: field name requested from the API and read from each document.
        prop_name: readable name interpolated into tool names and prompts.
    """
    # TODO: deal with units

    def __init__(self, prop, prop_name):
        super().__init__()
        self.prop = prop
        self.prop_name = prop_name

    def check_prop_by_formula(self, formula):
        '''Search the Materials Project API for the formula and return the value of this property.

        Only the first returned document is inspected, and only if its
        ``formula_pretty`` equals ``formula``. A message is returned instead
        of a value when no such document exists or when the property is
        missing from it.
        '''
        with MPRester(os.getenv("MAPI_API_KEY")) as mpr:
            docs = mpr.summary.search(formula=formula, fields=["formula_pretty", self.prop])
        if docs and docs[0].formula_pretty == formula:
            value = docs[0].dict()[self.prop]
            # The missing-value check belongs to the matching document;
            # otherwise a matching material without the property yields None.
            if value is None:
                return f"There is no record of {self.prop_name} for {formula}"
            return value
        return f"Could not find any material while searching {formula}"

    def create_context_prompt(self, formula):
        '''Build a few-shot prompt asking for this property's value for ``formula``.

        Materials sharing elements with ``formula`` are fetched, those without
        a recorded value are dropped, and an example selector picks ``self.k``
        of the rest for the prompt.
        '''
        # NOTE: this used to be an f-string, which Python does not treat as a
        # docstring; a plain string literal is used instead.
        elements = self.get_material_atoms(formula)
        similars = self.search_similars_by_atom(elements)
        # NOTE(review): the format spec "2f" means minimum width 2 with the
        # default six decimals; ".2f" may have been intended -- confirm.
        measured = [
            {
                'formula': ex.formula_pretty,
                'prop': f"{ex.dict()[self.prop]:2f}" if ex.dict()[self.prop] is not None else None,
            }
            for ex in similars
        ]
        examples = pd.DataFrame(measured).drop_duplicates().dropna().to_dict(orient="records")
        if not examples:
            # Without examples there is nothing to index for the selector;
            # report that instead of failing inside the vector store.
            return f"Could not find any similar material to build a context for {formula}"

        example_selector = MaxMarginalRelevanceExampleSelector.from_examples(
            examples,
            OpenAIEmbeddings(),
            FAISS,
            k=self.k,
        )

        # Each fragment ends with a newline so the instructions do not run
        # together when the pieces are concatenated.
        prefix = (
            f'You are a bot who can predict the {self.prop_name} of a material .\n'
            f'Given this list of known materials and the measurement of their {self.prop_name}, \n'
            f'you need to answer what is the {self.prop_name} of the material:\n'
            'The answer should be numeric and finish with ###'
        )
        # "@@@" separates question from answer and "###" ends each example.
        prompt_template = PromptTemplate(
            input_variables=["formula", "prop"],
            template=f"What is the {self.prop_name} for {{formula}}?@@@\n{{prop}}###",
        )
        suffix = f"What is the {self.prop_name} for {{formula}}?@@@\n"
        prompt = FewShotPromptTemplate(
            example_prompt=prompt_template,
            example_selector=example_selector,
            prefix=prefix,
            suffix=suffix,
            input_variables=["formula"],
        )

        return prompt.format(formula=formula)
utils.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.agents import Tool, tool
2
+ import requests
3
+ from langchain import OpenAI
4
+ from langchain import LLMMathChain, SerpAPIWrapper
5
+ from rdkit import Chem
6
+
7
# The docstring below doubles as the tool description shown to the agent, so
# implementation notes are kept in comments.
@tool
def query2smiles(text):
    '''This function queries the one given molecule name and returns a SMILES string from the record'''
    try:
        # Query the PubChem database by compound name.
        r = requests.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + text + '/property/IsomericSMILES/JSON')
        # Convert the response to a JSON object and pull out the SMILES string.
        data = r.json()
        return data['PropertyTable']['Properties'][0]['IsomericSMILES']
    except Exception:
        # The message must be *returned*: a bare expression here made the tool
        # yield None on failure. It also names the property actually looked up.
        return f"Could not find the SMILES string for {text}"
20
+
21
# The docstring below doubles as the tool description shown to the agent, so
# implementation notes are kept in comments.
@tool
def smiles2IUPAC(text):
    '''This function queries the one given smiles name and returns a IUPAC name from the record'''
    from urllib.parse import quote  # local import: file import block untouched

    try:
        # SMILES routinely contain URL-significant characters ('#' for triple
        # bonds starts a fragment, '/' splits the path), so the string is
        # percent-encoded before being placed in the URL path.
        encoded = quote(text, safe="")
        # Query the PubChem database.
        r = requests.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/' + encoded + '/property/IUPACName/JSON')
        data = r.json()
        return data["PropertyTable"]["Properties"][0]["IUPACName"]
    except Exception:
        # Best-effort lookup: any failure (network, JSON, missing keys) is
        # reported to the agent as text. KeyboardInterrupt/SystemExit are no
        # longer swallowed.
        return f"Could not find the IUPAC name for {text}"
32
+
33
# The docstring below doubles as the tool description shown to the agent, so
# implementation notes are kept in comments.
@tool
def formula2IUPAC(text):
    '''This function queries the one given chemical formula and returns a material name from the record.'''
    try:
        # Query the PubChem database by chemical formula.
        r = requests.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/formula/' + text + '/property/IUPACName/JSON')
        data = r.json()
        # (debug print of the raw response removed)
        return data["PropertyTable"]["Properties"][0]["IUPACName"]
    except Exception:
        # Best-effort lookup: any failure is reported to the agent as text.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return f"Could not find the IUPAC name for {text}"
44
+
45
# The docstring below doubles as the tool description shown to the agent, so
# implementation notes are kept in comments.
@tool
def name2formula(text):
    '''This function queries the one given material name and returns a chemical formula from the record.'''
    try:
        # Query the PubChem database by compound name.
        r = requests.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + text + '/property/MolecularFormula/JSON')
        data = r.json()
        # (debug print of the raw response removed)
        return data["PropertyTable"]["Properties"][0]["MolecularFormula"]
    except Exception:
        # Best-effort lookup: any failure is reported to the agent as text.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return f"Could not find the molecular formula for {text}"
56
+
57
# The docstring below doubles as the tool description shown to the agent, so
# implementation notes are kept in comments.
@tool
def canonicalizeSMILES(smiles):
    '''Given a smiles representation, this function returns a canonicalized version of the same smiles.
    It's better to search for molecules in its canonicalized form'''
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for a SMILES it cannot parse; report that to the
        # agent instead of letting MolToSmiles raise on None.
        return f"Could not parse the SMILES string {smiles}"
    return Chem.MolToSmiles(mol)
62
+
63
# The docstring below doubles as the tool description shown to the agent, so
# implementation notes are kept in comments.
@tool
def web_search(keywords, search_engine="google"):
    '''Useful to do a simple google search.
    Use this tool to find general information from websites.
    Use keywords for your search.
    '''
    # `os` is not imported at the top of this module, so it is imported here;
    # without it the os.getenv call below fails with NameError.
    import os

    # The SerpAPI key is read from the SERP_API_KEY environment variable.
    searcher = SerpAPIWrapper(
        serpapi_api_key=os.getenv("SERP_API_KEY"),
        search_engine=search_engine,
    )
    return searcher.run(keywords)
73
+
74
# The docstring below doubles as the tool description shown to the agent, so
# implementation notes are kept in comments.
@tool
def LLM_predict(prompt):
    ''' This function receives a prompt generate with context by the create_context_prompt tool and request a completion to a language model. Then returns the completion'''
    # TODO: Maybe change the model to gpt-4 when ready
    completion_settings = {
        "model_name": 'text-ada-001',
        "temperature": 0.7,
        "n": 1,
        "best_of": 5,
        "top_p": 1.0,
        "stop": ["\n\n", "###", "#", "##"],
    }
    completion_model = OpenAI(**completion_settings)
    # One prompt is sent; the text of its first generation is the answer.
    result = completion_model.generate([prompt])
    first_generation = result.generations[0][0]
    return first_generation.text
87
+
88
# Property-independent tools exported by this module.
# formula2IUPAC and name2formula are defined above but currently left out.
common_tools = [
    query2smiles,
    smiles2IUPAC,
    # formula2IUPAC,
    # name2formula,
    canonicalizeSMILES,
    web_search,
    LLM_predict,
]