supercat666 commited on
Commit
0d0c645
1 Parent(s): fc0071d

added cas9 off

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/CRISPRTool.iml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/venv" />
6
+ </content>
7
+ <orderEntry type="inheritedJdk" />
8
+ <orderEntry type="sourceFolder" forTests="false" />
9
+ </component>
10
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (CRISPRTool)" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/CRISPRTool.iml" filepath="$PROJECT_DIR$/.idea/CRISPRTool.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
app.py CHANGED
@@ -1,11 +1,13 @@
1
  import os
2
  import tiger
 
 
3
  import pandas as pd
4
  import streamlit as st
5
  from pathlib import Path
6
 
7
  # title and documentation
8
- st.markdown(Path('tiger.md').read_text(), unsafe_allow_html=True)
9
  st.divider()
10
 
11
  CRISPR_MODELS = ['Cas9', 'Cas12', 'Cas13d']
@@ -13,107 +15,169 @@ CRISPR_MODELS = ['Cas9', 'Cas12', 'Cas13d']
13
  selected_model = st.selectbox('Select CRISPR model:', CRISPR_MODELS, key='selected_model')
14
 
15
 
16
- # Check if the selected model is Cas9
17
- if selected_model == 'Cas9':
18
- # Use a radio button to select enzymes, making sure only one can be selected at a time
19
- enzyme_selection = st.radio(
20
- "Select an enzyme:",
21
- ('SPCas9_U6', 'SPCas9_t7', 'eSPCas9', 'SPCas9_HF1'),
22
- key='enzyme_selection'
23
- )
24
-
25
- # Actions based on the selected enzyme
26
- if enzyme_selection == 'SPCas9_U6':
27
- # Placeholder for action when SPCas9_U6 is selected
28
- pass
29
- elif enzyme_selection == 'SPCas9_t7':
30
- # Placeholder for action when SPCas9_t7 is selected
31
- pass
32
- elif enzyme_selection == 'eSPCas9':
33
- # Placeholder for action when eSPCas9 is selected
34
- pass
35
- elif enzyme_selection == 'SPCas9_HF1':
36
- # Placeholder for action when SPCas9_HF1 is selected
37
- pass
38
- elif selected_model == 'Cas12':
39
- # Placeholder for Cas12 model loading
40
- # TODO: Implement Cas12 model loading logic
41
- raise NotImplementedError("Cas12 model loading not implemented yet.")
42
- elif selected_model == 'Cas13d':
43
- ENTRY_METHODS = dict(
44
- manual='Manual entry of single transcript',
45
- fasta="Fasta file upload (supports multiple transcripts if they have unique ID's)"
46
- )
47
- @st.cache_data
48
- def convert_df(df):
49
  # IMPORTANT: Cache the conversion to prevent computation on every rerun
50
  return df.to_csv().encode('utf-8')
51
 
52
 
53
- def mode_change_callback():
54
- if st.session_state.mode in {tiger.RUN_MODES['all'], tiger.RUN_MODES['titration']}: # TODO: support titration
55
- st.session_state.check_off_targets = False
56
- st.session_state.disable_off_target_checkbox = True
57
- else:
58
- st.session_state.disable_off_target_checkbox = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
 
 
 
60
 
61
- def progress_update(update_text, percent_complete):
62
- with progress.container():
63
- st.write(update_text)
64
- st.progress(percent_complete / 100)
65
 
 
 
 
66
 
67
- def initiate_run():
 
 
 
 
 
 
 
68
 
69
- # initialize state variables
70
- st.session_state.transcripts = None
71
- st.session_state.input_error = None
72
- st.session_state.on_target = None
73
- st.session_state.titration = None
74
- st.session_state.off_target = None
75
 
76
- # initialize transcript DataFrame
77
- transcripts = pd.DataFrame(columns=[tiger.ID_COL, tiger.SEQ_COL])
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- # manual entry
 
 
 
 
 
 
80
  if st.session_state.entry_method == ENTRY_METHODS['manual']:
81
- transcripts = pd.DataFrame({
82
- tiger.ID_COL: ['ManualEntry'],
83
- tiger.SEQ_COL: [st.session_state.manual_entry]
84
- }).set_index(tiger.ID_COL)
 
 
 
 
 
 
 
 
85
 
86
- # fasta file upload
87
- elif st.session_state.entry_method == ENTRY_METHODS['fasta']:
88
- if st.session_state.fasta_entry is not None:
89
- fasta_path = st.session_state.fasta_entry.name
90
- with open(fasta_path, 'w') as f:
91
- f.write(st.session_state.fasta_entry.getvalue().decode('utf-8'))
92
- transcripts = tiger.load_transcripts([fasta_path], enforce_unique_ids=False)
93
- os.remove(fasta_path)
94
-
95
- # convert to upper case as used by tokenizer
96
- transcripts[tiger.SEQ_COL] = transcripts[tiger.SEQ_COL].apply(lambda s: s.upper().replace('U', 'T'))
97
-
98
- # ensure all transcripts have unique identifiers
99
- if transcripts.index.has_duplicates:
100
- st.session_state.input_error = "Duplicate transcript ID's detected in fasta file"
101
 
102
- # ensure all transcripts only contain nucleotides A, C, G, T, and wildcard N
103
- elif not all(transcripts[tiger.SEQ_COL].apply(lambda s: set(s).issubset(tiger.NUCLEOTIDE_TOKENS.keys()))):
104
- st.session_state.input_error = 'Transcript(s) must only contain upper or lower case A, C, G, and Ts or Us'
 
 
 
105
 
106
- # ensure all transcripts satisfy length requirements
107
- elif any(transcripts[tiger.SEQ_COL].apply(lambda s: len(s) < tiger.TARGET_LEN)):
108
- st.session_state.input_error = 'Transcript(s) must be at least {:d} bases.'.format(tiger.TARGET_LEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- # run model if we have any transcripts
111
- elif len(transcripts) > 0:
112
- st.session_state.transcripts = transcripts
 
 
 
 
 
113
 
 
 
 
 
 
 
 
 
 
114
 
115
  if __name__ == '__main__':
116
-
117
  # app initialization
118
  if 'mode' not in st.session_state:
119
  st.session_state.mode = tiger.RUN_MODES['all']
@@ -235,5 +299,4 @@ elif selected_model == 'Cas13d':
235
  )
236
  st.session_state.transcripts = None
237
  st.experimental_rerun()
238
- else:
239
- raise ValueError(f"Unknown model: {model_name}")
 
1
  import os
2
  import tiger
3
+ import cas9on
4
+ import cas9off
5
  import pandas as pd
6
  import streamlit as st
7
  from pathlib import Path
8
 
9
# title and documentation
st.markdown(Path('crisprTool.md').read_text(), unsafe_allow_html=True)
st.divider()

# Supported CRISPR model families offered in the UI.
CRISPR_MODELS = ['Cas9', 'Cas12', 'Cas13d']

# Model picker; the choice persists in st.session_state['selected_model'].
selected_model = st.selectbox('Select CRISPR model:', CRISPR_MODELS, key='selected_model')
16
 
17
 
18
@st.cache_data
def convert_df(df):
    """Serialize a DataFrame to UTF-8 CSV bytes for st.download_button.

    Cached via st.cache_data so the conversion is not recomputed on every
    Streamlit rerun.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
22
 
23
 
24
def mode_change_callback():
    """Sync the off-target checkbox with the selected run mode.

    The 'all' and 'titration' run modes do not support off-target checks
    (TODO: support titration), so the checkbox is cleared and disabled
    whenever one of them is active.
    """
    unsupported = st.session_state.mode in {tiger.RUN_MODES['all'], tiger.RUN_MODES['titration']}  # TODO: support titration
    if unsupported:
        st.session_state.check_off_targets = False
    st.session_state.disable_off_target_checkbox = unsupported
30
+
31
+
32
def progress_update(update_text, percent_complete):
    """Render a status message and progress bar in the shared `progress` placeholder.

    `percent_complete` arrives on a 0-100 scale and is rescaled to 0-1
    as expected by st.progress.
    """
    target = progress.container()
    with target:
        st.write(update_text)
        st.progress(percent_complete / 100)
36
+
37
+
38
def initiate_run():
    """Validate user-supplied transcripts and stage them for a model run.

    Reads the entry method and raw input from st.session_state, builds a
    transcript DataFrame, runs the validation chain (unique IDs, allowed
    characters, minimum length), and either records an error message in
    st.session_state.input_error or stores the validated DataFrame in
    st.session_state.transcripts.
    """
    # initialize state variables
    st.session_state.transcripts = None
    st.session_state.input_error = None
    st.session_state.on_target = None
    st.session_state.titration = None
    st.session_state.off_target = None

    # initialize transcript DataFrame
    transcripts = pd.DataFrame(columns=[tiger.ID_COL, tiger.SEQ_COL])

    # manual entry
    if st.session_state.entry_method == ENTRY_METHODS['manual']:
        transcripts = pd.DataFrame({
            tiger.ID_COL: ['ManualEntry'],
            tiger.SEQ_COL: [st.session_state.manual_entry]
        }).set_index(tiger.ID_COL)

    # fasta file upload
    elif st.session_state.entry_method == ENTRY_METHODS['fasta']:
        if st.session_state.fasta_entry is not None:
            # Write the upload to a local temp file so tiger can parse it,
            # then remove it immediately after loading.
            fasta_path = st.session_state.fasta_entry.name
            with open(fasta_path, 'w') as f:
                f.write(st.session_state.fasta_entry.getvalue().decode('utf-8'))
            transcripts = tiger.load_transcripts([fasta_path], enforce_unique_ids=False)
            os.remove(fasta_path)

    # convert to upper case as used by tokenizer
    transcripts[tiger.SEQ_COL] = transcripts[tiger.SEQ_COL].apply(lambda s: s.upper().replace('U', 'T'))

    # ensure all transcripts have unique identifiers
    if transcripts.index.has_duplicates:
        st.session_state.input_error = "Duplicate transcript ID's detected in fasta file"

    # ensure all transcripts only contain nucleotides A, C, G, T, and wildcard N
    elif not all(transcripts[tiger.SEQ_COL].apply(lambda s: set(s).issubset(tiger.NUCLEOTIDE_TOKENS.keys()))):
        st.session_state.input_error = 'Transcript(s) must only contain upper or lower case A, C, G, and Ts or Us'

    # ensure all transcripts satisfy length requirements
    elif any(transcripts[tiger.SEQ_COL].apply(lambda s: len(s) < tiger.TARGET_LEN)):
        st.session_state.input_error = 'Transcript(s) must be at least {:d} bases.'.format(tiger.TARGET_LEN)

    # run model if we have any transcripts
    elif len(transcripts) > 0:
        st.session_state.transcripts = transcripts
83
 
84
# Check if the selected model is Cas9
if selected_model == 'Cas9':
    # Use a radio button to select enzymes, making sure only one can be selected at a time
    target_selection = st.radio(
        "Select either on-target or off-target:",
        ('on-target', 'off-target'),
        key='target_selection'
    )

    # Actions based on the selected enzyme
    if target_selection == 'on-target':
        # NOTE(review): on-target path is a stub — presumably cas9on.predict
        # will be wired in here; confirm intended behavior.
        pass
    elif target_selection == 'off-target':
        ENTRY_METHODS = dict(
            manual='Manual entry of target sequence',
            txt="txt file upload"
        )
        # NOTE(review): nesting an `if __name__ == '__main__':` guard inside a
        # branch is unusual for a Streamlit script — verify this is intentional.
        if __name__ == '__main__':
            # app initialization for Cas9 off-target
            if 'target_sequence' not in st.session_state:
                st.session_state.target_sequence = None
            if 'input_error' not in st.session_state:
                st.session_state.input_error = None
            if 'off_target_results' not in st.session_state:
                st.session_state.off_target_results = None

            # target sequence entry
            st.selectbox(
                label='How would you like to provide target sequences?',
                options=ENTRY_METHODS.values(),
                key='entry_method',
                disabled=st.session_state.target_sequence is not None
            )
            if st.session_state.entry_method == ENTRY_METHODS['manual']:
                st.text_input(
                    label='Enter on/off sequences:',
                    key='manual_entry',
                    placeholder='Enter on/off sequences like:GGGTGGGGGGAGTTTGCTCCAGG,AGGTGGGGTGA_TTTGCTCCAGG',
                    disabled=st.session_state.target_sequence is not None
                )
            elif st.session_state.entry_method == ENTRY_METHODS['txt']:
                st.file_uploader(
                    label='Upload a txt file:',
                    key='txt_entry',
                    disabled=st.session_state.target_sequence is not None
                )

            # prediction button
            # NOTE(review): cas9off.CRISPR_net_predict requires an X_test argument,
            # but Streamlit invokes on_click callbacks with no arguments — clicking
            # this button will raise a TypeError. Confirm the intended callback.
            st.button(label='Predict off-target effects', on_click=cas9off.CRISPR_net_predict,
                      disabled=st.session_state.target_sequence is not None)
            progress = st.empty()

            # input error display
            error = st.empty()
            if st.session_state.input_error is not None:
                error.error(st.session_state.input_error, icon="🚨")
            else:
                error.empty()

            # off-target results display
            off_target_results = st.empty()
            if st.session_state.off_target_results is not None:
                with off_target_results.container():
                    if len(st.session_state.off_target_results) > 0:
                        st.write('Off-target predictions:', st.session_state.off_target_results)
                        st.download_button(
                            label='Download off-target predictions',
                            data=convert_df(st.session_state.off_target_results),
                            file_name='off_target_results.csv',
                            mime='text/csv'
                        )
                    else:
                        st.write('No significant off-target effects detected!')
            else:
                off_target_results.empty()

            # running the CRISPR-Net model for off-target predictions
            # NOTE(review): cas9off.predict_off_targets is not defined in
            # cas9off.py as added in this commit — this branch will raise
            # AttributeError when reached; verify the intended entry point.
            if st.session_state.target_sequence is not None:
                st.session_state.off_target_results = cas9off.predict_off_targets(
                    target_sequence=st.session_state.target_sequence,
                    status_update_fn=progress_update
                )
                st.session_state.target_sequence = None
                st.experimental_rerun()

elif selected_model == 'Cas12':
    # Placeholder for Cas12 model loading
    # TODO: Implement Cas12 model loading logic
    raise NotImplementedError("Cas12 model loading not implemented yet.")
174
+ elif selected_model == 'Cas13d':
175
+ ENTRY_METHODS = dict(
176
+ manual='Manual entry of single transcript',
177
+ fasta="Fasta file upload (supports multiple transcripts if they have unique ID's)"
178
+ )
179
 
180
  if __name__ == '__main__':
 
181
  # app initialization
182
  if 'mode' not in st.session_state:
183
  st.session_state.mode = tiger.RUN_MODES['all']
 
299
  )
300
  st.session_state.transcripts = None
301
  st.experimental_rerun()
302
+
 
cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_structure.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "main_input", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 1, 24, 7], "dtype": "float32", "sparse": false, "name": "main_input"}, "inbound_nodes": []}, {"name": "conv2d_1", "class_name": "Conv2D", "config": {"name": "conv2d_1", "trainable": true, "filters": 10, "kernel_size": [1, 1], "strides": [1, 1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["main_input", 0, 0, {}]]]}, {"name": "conv2d_2", "class_name": "Conv2D", "config": {"name": "conv2d_2", "trainable": true, "filters": 10, "kernel_size": [1, 2], "strides": [1, 1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["main_input", 0, 0, {}]]]}, {"name": "conv2d_3", "class_name": "Conv2D", "config": {"name": "conv2d_3", "trainable": true, "filters": 10, "kernel_size": [1, 3], "strides": [1, 1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": 
{"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["main_input", 0, 0, {}]]]}, {"name": "conv2d_4", "class_name": "Conv2D", "config": {"name": "conv2d_4", "trainable": true, "filters": 10, "kernel_size": [1, 5], "strides": [1, 1], "padding": "same", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["main_input", 0, 0, {}]]]}, {"name": "activation_1", "class_name": "Activation", "config": {"name": "activation_1", "trainable": true, "activation": "relu"}, "inbound_nodes": [[["conv2d_1", 0, 0, {}]]]}, {"name": "activation_2", "class_name": "Activation", "config": {"name": "activation_2", "trainable": true, "activation": "relu"}, "inbound_nodes": [[["conv2d_2", 0, 0, {}]]]}, {"name": "activation_3", "class_name": "Activation", "config": {"name": "activation_3", "trainable": true, "activation": "relu"}, "inbound_nodes": [[["conv2d_3", 0, 0, {}]]]}, {"name": "activation_4", "class_name": "Activation", "config": {"name": "activation_4", "trainable": true, "activation": "relu"}, "inbound_nodes": [[["conv2d_4", 0, 0, {}]]]}, {"name": "concatenate_1", "class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "axis": -1}, "inbound_nodes": [[["main_input", 0, 0, {}], ["activation_1", 0, 0, {}], ["activation_2", 0, 0, {}], ["activation_3", 0, 0, {}], ["activation_4", 0, 0, {}]]]}, {"name": "reshape_1", "class_name": "Reshape", "config": {"name": "reshape_1", "trainable": true, "target_shape": [24, 47]}, 
"inbound_nodes": [[["concatenate_1", 0, 0, {}]]]}, {"name": "bidirectional_1", "class_name": "Bidirectional", "config": {"name": "bidirectional_1", "trainable": true, "layer": {"class_name": "LSTM", "config": {"name": "LSTM_out", "trainable": true, "batch_input_shape": [null, 24, 47], "dtype": "float32", "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 15, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 1}}, "merge_mode": "concat"}, "inbound_nodes": [[["reshape_1", 0, 0, {}]]]}, {"name": "flatten_1", "class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true, "data_format": "channels_last"}, "inbound_nodes": [[["bidirectional_1", 0, 0, {}]]]}, {"name": "dense_1", "class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 80, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["flatten_1", 0, 0, {}]]]}, {"name": "dense_2", "class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 20, "activation": 
"relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["dense_1", 0, 0, {}]]]}, {"name": "dropout_1", "class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.35, "noise_shape": null, "seed": null}, "inbound_nodes": [[["dense_2", 0, 0, {}]]]}, {"name": "main_output", "class_name": "Dense", "config": {"name": "main_output", "trainable": true, "units": 1, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["dropout_1", 0, 0, {}]]]}], "input_layers": [["main_input", 0, 0]], "output_layers": [["main_output", 0, 0]]}, "keras_version": "2.2.4", "backend": "tensorflow"}
cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7f6aa381520f5c68fa1f099a6ef3ebc3b8ce846709b97dfde2053f26ca62f80
3
+ size 312432
cas9_model/on-cla.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5acf8f740cf326052ad08db2ca71d7204526c61f6a9fcdca36e15004bc16ad04
3
+ size 34044032
cas9off.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ import numpy as np
3
+ import pandas as pd
4
+ import os
5
+ import argparse
6
+
7
# Column names shared with the rest of the app.
ID_COL = 'Transcript ID'
SEQ_COL = 'Transcript Sequence'

# Configure GPUs: enable memory growth and pin computation to the first device.
_gpus = tf.config.list_physical_devices('GPU')
for _gpu in _gpus:
    tf.config.experimental.set_memory_growth(_gpu, enable=True)
if len(_gpus) > 0:
    tf.config.experimental.set_visible_devices(_gpus[0], 'GPU')

# Application configuration.
BATCH_SIZE_COMPUTE = 500
BATCH_SIZE_SCAN = 20
BATCH_SIZE_TRANSCRIPTS = 50
NUM_TOP_GUIDES = 10
NUM_MISMATCHES = 3
RUN_MODES = dict(
    all='All on-target guides per transcript',
    top_guides=f'Top {NUM_TOP_GUIDES:d} guides per transcript',
    titration=f'Top {NUM_TOP_GUIDES:d} guides per transcript & their titration candidates',
)
28
+
29
class Encoder:
    """Encode an on-target / off-target sequence pair as a 24x7 feature matrix.

    Sequences shorter than 24 bases are left-padded with '-'. Each position
    contributes five channels (bitwise OR of the two one-hot base codes over
    the alphabet A/T/G/C/_) plus two direction channels recording which base
    ranks higher under ``direction_dict``. The result is stored in
    ``self.on_off_code`` with shape (24, 7).
    """

    def __init__(self, on_seq, off_seq, with_category=False, label=None, with_reg_val=False, value=None):
        tlen = 24
        # Left-pad both sequences with '-' up to a fixed length of 24.
        self.on_seq = "-" * (tlen - len(on_seq)) + on_seq
        self.off_seq = "-" * (tlen - len(off_seq)) + off_seq
        self.encoded_dict_indel = {'A': [1, 0, 0, 0, 0], 'T': [0, 1, 0, 0, 0],
                                   'G': [0, 0, 1, 0, 0], 'C': [0, 0, 0, 1, 0],
                                   '_': [0, 0, 0, 0, 1], '-': [0, 0, 0, 0, 0]}
        self.direction_dict = {'A': 5, 'G': 4, 'C': 3, 'T': 2, '_': 1}
        if with_category:
            self.label = label
        if with_reg_val:
            self.value = value
        self.encode_on_off_dim7()

    def encode_sgRNA(self):
        """One-hot encode the padded on-target sequence; 'N' copies the off-target base."""
        resolved = [
            off_base if on_base == "N" else on_base
            for on_base, off_base in zip(self.on_seq, self.off_seq)
        ]
        self.sgRNA_code = np.array([self.encoded_dict_indel[base] for base in resolved])

    def encode_off(self):
        """One-hot encode the padded off-target sequence."""
        self.off_code = np.array([self.encoded_dict_indel[base] for base in self.off_seq])

    def encode_on_off_dim7(self):
        """Build the 24x7 pair encoding: OR of both one-hot codes plus two direction bits."""
        self.encode_sgRNA()
        self.encode_off()
        rows = []
        for i, (on_b, off_b) in enumerate(zip(self.on_seq, self.off_seq)):
            # Union of the two base codes marks every base present at this position.
            diff_code = np.bitwise_or(self.sgRNA_code[i], self.off_code[i])
            if on_b == "N":
                on_b = off_b
            dir_code = np.zeros(2)
            # Direction bits stay zero for pads and matching ranks.
            if on_b != "-" and off_b != "-" and self.direction_dict[on_b] != self.direction_dict[off_b]:
                if self.direction_dict[on_b] > self.direction_dict[off_b]:
                    dir_code[0] = 1
                else:
                    dir_code[1] = 1
            rows.append(np.concatenate((diff_code, dir_code)))
        self.on_off_code = np.array(rows)
83
+
84
def encode_on_off_seq_pairs(input_file, output_file="CRISPR_net_results.csv"):
    """Score on/off-target sequence pairs listed in a CSV file.

    Parameters
    ----------
    input_file : str
        Path to a headerless CSV with two columns: on-target sequence,
        off-target sequence.
    output_file : str, optional
        Where to write the scored table. Defaults to 'CRISPR_net_results.csv'
        (the value the original hard-coded) for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        The input pairs with an added 'CRISPR_Net_score' column.
    """
    inputs = pd.read_csv(input_file, delimiter=",", header=None, names=['on_seq', 'off_seq'])
    input_codes = []
    for _, row in inputs.iterrows():
        en = Encoder(on_seq=row['on_seq'], off_seq=row['off_seq'])
        input_codes.append(en.on_off_code)
    # CRISPR-Net expects a (batch, 1, 24, 7) tensor.
    input_codes = np.array(input_codes).reshape((len(input_codes), 1, 24, 7))
    inputs['CRISPR_Net_score'] = CRISPR_net_predict(input_codes)
    inputs.to_csv(output_file, index=False)
    return inputs
97
+
98
def CRISPR_net_predict(X_test):
    """Run the pre-trained CRISPR-Net model on encoded sequence pairs.

    Parameters
    ----------
    X_test : np.ndarray
        Encoded on/off pairs of shape (batch, 1, 24, 7).

    Returns
    -------
    np.ndarray
        1-D array of off-target scores, one per input pair.
    """
    # Context manager replaces the manual open/read/close so the handle is
    # released even if reading or JSON parsing fails.
    with open("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_structure.json", 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = tf.keras.models.model_from_json(loaded_model_json)  # Updated for TensorFlow 2
    loaded_model.load_weights("cas9_model/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5")
    y_pred = loaded_model.predict(X_test).flatten()
    return y_pred
106
+
107
if __name__ == '__main__':
    # Command-line entry point: score on/off-target pairs from a CSV file.
    parser = argparse.ArgumentParser(description="CRISPR-Net v1.0 (Aug 10 2019)")
    parser.add_argument("input_file",
                        help="input_file example (on-target seq, off-target seq):\n GAGT_CCGAGCAGAAGAAGAATGG,GAGTACCAAGTAGAAGAAAAATTT\n"
                             "GTTGCCCCACAGGGCAGTAAAGG,GTGGACACCCCGGGCAGGAAAGG\n"
                             "GGGTGGGGGGAGTTTGCTCCAGG,AGGTGGGGTGA_TTTGCTCCAGG")
    args = parser.parse_args()
    input_path = args.input_file
    if os.path.exists(input_path):
        encode_on_off_seq_pairs(input_path)
    else:
        print("File doesn't exist!")
    # Free the TensorFlow graph/session resources before exit.
    tf.keras.backend.clear_session()
cas9on.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+ import pandas as pd
3
+ import numpy as np
4
+ from operator import add
5
+ from functools import reduce
6
+
7
# Configure GPUs: allow memory growth and restrict TensorFlow to the first device.
_physical_gpus = tf.config.list_physical_devices('GPU')
for _gpu in _physical_gpus:
    tf.config.experimental.set_memory_growth(_gpu, enable=True)
if _physical_gpus:
    tf.config.experimental.set_visible_devices(_physical_gpus[0], 'GPU')
12
+
13
+
14
# One-hot code per nucleotide and binary code per epigenetic mark.
ntmap = {'A': (1, 0, 0, 0),
         'C': (0, 1, 0, 0),
         'G': (0, 0, 1, 0),
         'T': (0, 0, 0, 1)
         }
epimap = {'A': 1, 'N': 0}


def get_seqcode(seq):
    """One-hot encode a nucleotide sequence as an array of shape (1, len(seq), 4).

    Input is upper-cased first, so lower-case bases are accepted.
    """
    codes = [ntmap[base] for base in seq.upper()]
    return np.array(codes).reshape((1, len(seq), -1))


def get_epicode(eseq):
    """Encode an epigenetic-mark string ('A'/'N') as an array of shape (1, len(eseq), 1)."""
    codes = [epimap[mark] for mark in eseq]
    return np.array(codes).reshape(1, len(eseq), -1)
29
+
30
class Episgt:
    """Loader for tab-separated .episgt files pairing sgRNA sequences with epigenetic features.

    Only the trailing `num_epi_features + 2` columns (or `+ 1` when no label
    column is present) are used: one sequence column, `num_epi_features`
    epigenetic columns, and optionally a final label column.
    """

    def __init__(self, fpath, num_epi_features, with_y=True):
        # Raw table; the file is expected to have no header row.
        self._fpath = fpath
        self._ori_df = pd.read_csv(fpath, sep='\t', index_col=None, header=None)
        self._num_epi_features = num_epi_features
        self._with_y = with_y
        # Keep only the trailing columns of interest: sequence, epi marks, [label].
        self._num_cols = num_epi_features + 2 if with_y else num_epi_features + 1
        self._cols = list(self._ori_df.columns)[-self._num_cols:]
        self._df = self._ori_df[self._cols]

    @property
    def length(self):
        # Number of rows (samples) in the file.
        return len(self._df)

    def get_dataset(self, x_dtype=np.float32, y_dtype=np.float32):
        """Return the encoded feature tensor (and labels when `with_y` is True).

        x has shape (N, 4 + num_epi_features, seq_len): one-hot nucleotide
        channels stacked with the epigenetic channels, channels-first.
        """
        # One-hot encode every sequence and stack along the batch axis.
        x_seq = np.concatenate(list(map(get_seqcode, self._df[self._cols[0]])))
        # Encode each epigenetic column, then stack those channels on the last axis.
        x_epis = np.concatenate([np.concatenate(list(map(get_epicode, self._df[col]))) for col in
                                 self._cols[1: 1 + self._num_epi_features]], axis=-1)
        x = np.concatenate([x_seq, x_epis], axis=-1).astype(x_dtype)
        # (N, seq_len, channels) -> (N, channels, seq_len)
        x = x.transpose(0, 2, 1)
        if self._with_y:
            y = np.array(self._df[self._cols[-1]]).astype(y_dtype)
            return x, y
        else:
            return x
55
+
56
+ from keras.models import load_model
57
+
58
class DCModelOntar:
    """Thin wrapper around a pre-trained Keras on-target model."""

    def __init__(self, ontar_model_dir, is_reg=False):
        """Load the saved model from disk.

        Parameters
        ----------
        ontar_model_dir : str
            Path to the saved Keras model file.
        is_reg : bool, optional
            Kept for backward compatibility. Both branches of the original
            `if is_reg:` called load_model identically, so the conditional
            was dead code and has been collapsed.
        """
        self.model = load_model(ontar_model_dir)

    def ontar_predict(self, x, channel_first=True):
        """Predict on-target efficiency scores for a batch of encoded inputs.

        Parameters
        ----------
        x : np.ndarray
            Input batch. When `channel_first` is True it is transposed from
            (N, C, H, W) to channels-last (N, H, W, C) before prediction.
        channel_first : bool, optional
            Whether `x` carries its channel axis first (default True).

        Returns
        -------
        np.ndarray
            Flattened 1-D array of model predictions.
        """
        if channel_first:
            x = x.transpose([0, 2, 3, 1])
        yp = self.model.predict(x)
        return yp.ravel()
70
+
71
def predict(file_path='eg_cls_on_target.episgt', model_path='cas9_model/on-cla.h5'):
    """Run on-target prediction for every row of an .episgt file.

    Parameters
    ----------
    file_path : str, optional
        Path to the input .episgt file (defaults to the bundled example).
    model_path : str, optional
        Path to the saved on-target model. This commit stores the weights at
        'cas9_model/on-cla.h5'; the original hard-coded 'on-cla.h5', which
        does not exist at the repository root.

    Returns
    -------
    np.ndarray
        Predicted on-target efficiency scores, one per input row.
    """
    input_data = Episgt(file_path, num_epi_features=4, with_y=True)
    x, y = input_data.get_dataset()
    # Insert a singleton spatial axis: (N, 8, seq_len) -> (N, 8, 1, seq_len).
    x = np.expand_dims(x, axis=2)
    dcModel = DCModelOntar(model_path)
    predicted_on_target = dcModel.ontar_predict(x)
    return predicted_on_target
tiger.md → crisprTool.md RENAMED
File without changes