supercat666 committed on
Commit
d51aeae
1 Parent(s): 114492c
Files changed (2) hide show
  1. app.py +16 -21
  2. cas9on.py +56 -93
app.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
11
  import zipfile
12
  import io
13
  import gtracks
 
14
 
15
 
16
 
@@ -275,34 +276,29 @@ if selected_model == 'Cas9':
275
  gene_sequence = st.session_state['gene_sequence']
276
 
277
  # Define file paths
278
- # genbank_file_path = f"{gene_symbol}_crispr_targets.gb"
279
- # bed_file_path = f"{gene_symbol}_crispr_targets.bed"
280
- # csv_file_path = f"{gene_symbol}_crispr_predictions.csv"
281
- bigwig_file_path = f"{gene_symbol}_crispr_predictions.bw"
282
 
283
- # Generate files
284
- # cas9on.generate_genbank_file_from_df(df, gene_sequence, gene_symbol, genbank_file_path)
285
- # cas9on.create_bed_file_from_df(df, bed_file_path)
286
- # cas9on.create_csv_from_df(df, csv_file_path)
287
 
288
- # Assuming create_bigwig is a function that generates a BigWig file from the DataFrame
289
- cas9on.create_bigwig(df, bigwig_file_path)
 
 
290
 
291
  # Prepare an in-memory buffer for the ZIP file
292
  zip_buffer = io.BytesIO()
293
  with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
294
  # For each file, add it to the ZIP file
295
- # zip_file.write(genbank_file_path)
296
- # zip_file.write(bed_file_path)
297
- # zip_file.write(csv_file_path)
298
- zip_file.write(bigwig_file_path)
299
 
300
  # Important: move the cursor to the beginning of the BytesIO buffer before reading it
301
  zip_buffer.seek(0)
302
 
303
- track = gtracks.Track(bigwig_file_path)
304
- plot = gtracks.Plot(tracks=[track])
305
-
306
  # Specify the region you want to visualize
307
  min_start = df['Start Pos'].min()
308
  max_end = df['End Pos'].max()
@@ -310,14 +306,13 @@ if selected_model == 'Cas9':
310
  region = f"{chromosome}:{min_start}-{max_end}"
311
 
312
  # Generate the pyGenomeTracks plot
313
- plot_image_path = f"{gene_symbol}_gtracks_plot.png"
314
- plot.plot(region=region, output_file=plot_image_path)
315
- # Display the pyGenomeTracks plot image in Streamlit
316
  st.image(plot_image_path)
317
 
318
  # Display the download button for the ZIP file
319
  st.download_button(
320
- label="Download GenBank, BED, CSV, and BigWig files as ZIP",
321
  data=zip_buffer.getvalue(),
322
  file_name=f"{gene_symbol}_files.zip",
323
  mime="application/zip"
 
11
  import zipfile
12
  import io
13
  import gtracks
14
+ import subprocess
15
 
16
 
17
 
 
276
  gene_sequence = st.session_state['gene_sequence']
277
 
278
  # Define file paths
279
+ genbank_file_path = f"{gene_symbol}_crispr_targets.gb"
280
+ bed_file_path = f"{gene_symbol}_crispr_targets.bed"
281
+ csv_file_path = f"{gene_symbol}_crispr_predictions.csv"
282
+ plot_image_path = f"{gene_symbol}_gtracks_plot.png"
283
 
 
 
 
 
284
 
285
+ # Generate files
286
+ cas9on.generate_genbank_file_from_df(df, gene_sequence, gene_symbol, genbank_file_path)
287
+ cas9on.create_bed_file_from_df(df, bed_file_path)
288
+ cas9on.create_csv_from_df(df, csv_file_path)
289
 
290
  # Prepare an in-memory buffer for the ZIP file
291
  zip_buffer = io.BytesIO()
292
  with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
293
  # For each file, add it to the ZIP file
294
+ zip_file.write(genbank_file_path)
295
+ zip_file.write(bed_file_path)
296
+ zip_file.write(csv_file_path)
297
+
298
 
299
  # Important: move the cursor to the beginning of the BytesIO buffer before reading it
300
  zip_buffer.seek(0)
301
 
 
 
 
302
  # Specify the region you want to visualize
303
  min_start = df['Start Pos'].min()
304
  max_end = df['End Pos'].max()
 
306
  region = f"{chromosome}:{min_start}-{max_end}"
307
 
308
  # Generate the pyGenomeTracks plot
309
# Invoke gtracks with an argument list instead of a shell string: the file
# names are derived from gene_symbol (presumably user-supplied -- confirm),
# so interpolating them into a shell command would allow shell injection and
# would break on names containing spaces or other shell metacharacters.
gtracks_command = ["gtracks", str(region), bed_file_path, plot_image_path]
# check=True raises CalledProcessError when gtracks fails, rather than
# continuing on to display an image file that was never written.
subprocess.run(gtracks_command, check=True)
 
311
  st.image(plot_image_path)
312
 
313
  # Display the download button for the ZIP file
314
  st.download_button(
315
+ label="Download GenBank, BED, CSV files as ZIP",
316
  data=zip_buffer.getvalue(),
317
  file_name=f"{gene_symbol}_files.zip",
318
  mime="application/zip"
cas9on.py CHANGED
@@ -147,100 +147,63 @@ def process_gene(gene_symbol, model_path):
147
  return results, all_gene_sequences, all_exons
148
 
149
 
150
- # def create_genbank_features(data):
151
- # features = []
152
- #
153
- # # If the input data is a DataFrame, convert it to a list of lists
154
- # if isinstance(data, pd.DataFrame):
155
- # formatted_data = data.values.tolist()
156
- # elif isinstance(data, list):
157
- # formatted_data = data
158
- # else:
159
- # raise TypeError("Data should be either a list or a pandas DataFrame.")
160
- #
161
- # for row in formatted_data:
162
- # try:
163
- # start = int(row[1])
164
- # end = int(row[2])
165
- # except ValueError as e:
166
- # print(f"Error converting start/end to int: {row[1]}, {row[2]} - {e}")
167
- # continue
168
- #
169
- # strand = 1 if row[3] == '+' else -1
170
- # location = FeatureLocation(start=start, end=end, strand=strand)
171
- # feature = SeqFeature(location=location, type="misc_feature", qualifiers={
172
- # 'label': row[7], # Use gRNA as the label
173
- # 'note': f"Prediction: {row[8]}" # Include the prediction score
174
- # })
175
- # features.append(feature)
176
- #
177
- # return features
178
- #
179
- #
180
- # def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
181
- # features = create_genbank_features(df)
182
- # record = SeqRecord(Seq(gene_sequence), id=gene_symbol, name=gene_symbol,
183
- # description=f'CRISPR Cas9 predicted targets for {gene_symbol}', features=features)
184
- # record.annotations["molecule_type"] = "DNA"
185
- # SeqIO.write(record, output_path, "genbank")
186
- #
187
- #
188
- # def create_bed_file_from_df(df, output_path):
189
- # with open(output_path, 'w') as bed_file:
190
- # for index, row in df.iterrows():
191
- # chrom = row["Chr"]
192
- # start = int(row["Start Pos"]) # Assuming 'Start Pos' is the column name in the df
193
- # end = int(row["End Pos"]) # Assuming 'End Pos' is the column name in the df
194
- # strand = '+' if row["Strand"] == '1' else '-' # Assuming 'Strand' is the column name in the df
195
- # gRNA = row["gRNA"]
196
- # score = str(row["Prediction"])
197
- # transcript_id = row["Transcript"] # Assuming 'Transcript' is the column name in the df
198
- #
199
- # bed_file.write(f"{chrom}\t{start}\t{end}\t{gRNA}\t{score}\t{strand}\t{transcript_id}\n")
200
- #
201
- #
202
- # def create_csv_from_df(df, output_path):
203
- # df.to_csv(output_path, index=False)
204
-
205
def create_bigwig(df, bigwig_path):
    """Write the prediction scores in ``df`` to a BigWig file.

    Args:
        df: pandas DataFrame with the columns "Chr", "Start Pos", "End Pos"
            and "Prediction". The three numeric columns are coerced in place
            (int, int, float), as in the original implementation, so callers
            that rely on that side effect keep working.
        bigwig_path: Path of the BigWig file to create.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    required_columns = ["Chr", "Start Pos", "End Pos", "Prediction"]
    if any(column not in df.columns for column in required_columns):
        raise ValueError(f"DataFrame must contain {required_columns} columns.")

    # Convert columns to the correct types (in place, see docstring).
    df['Start Pos'] = df['Start Pos'].astype(int)
    df['End Pos'] = df['End Pos'].astype(int)
    df['Prediction'] = df['Prediction'].astype(float)

    # One group per chromosome, in order of first appearance so that the
    # header order and the entry order agree. Within a chromosome the
    # intervals are sorted by start, because BigWig entries must be added
    # in order.
    # NOTE(review): overlapping intervals are not merged here; confirm that
    # the targets written to one chromosome never overlap.
    per_chromosome = [
        (str(chrom), group.sort_values('Start Pos'))
        for chrom, group in df.groupby('Chr', sort=False)
    ]

    # The header needs a length per chromosome; the largest end position seen
    # is used as a stand-in. Plain Python ints are passed rather than NumPy
    # integers.
    chrom_sizes = [
        (chrom, int(group['End Pos'].max())) for chrom, group in per_chromosome
    ]

    bw = pyBigWig.open(bigwig_path, "w")
    try:
        bw.addHeader(chrom_sizes)
        for chrom, group in per_chromosome:
            starts = group['Start Pos'].tolist()
            ends = group['End Pos'].tolist()
            values = group['Prediction'].tolist()
            bw.addEntries([chrom] * len(starts), starts, ends=ends, values=values)
    finally:
        # Always release the file handle, even if writing fails part-way.
        bw.close()
244
 
245
 
246
 
 
147
  return results, all_gene_sequences, all_exons
148
 
149
 
150
def _strand_to_int(strand):
    """Map a strand marker to +1 (forward) or -1 (reverse).

    '+', and any value that parses as a positive number ('1', '+1', 1), are
    treated as forward; everything else is treated as reverse.
    """
    text = str(strand).strip()
    if text == '+':
        return 1
    try:
        return 1 if float(text) > 0 else -1
    except ValueError:
        return -1


def create_genbank_features(data):
    """Build one ``misc_feature`` SeqFeature per CRISPR target row.

    Rows are read positionally: index 1 is the start, 2 the end, 3 the
    strand, 7 the gRNA (used as the feature label) and 8 the prediction
    score (stored in the feature note).

    Args:
        data: A pandas DataFrame or a list of row sequences.

    Returns:
        list: The SeqFeature objects, in input order. Rows whose start or end
        cannot be converted to ``int`` are reported on stdout and skipped.

    Raises:
        TypeError: If ``data`` is neither a list nor a DataFrame.
    """
    # If the input data is a DataFrame, convert it to a list of lists
    if isinstance(data, pd.DataFrame):
        rows = data.values.tolist()
    elif isinstance(data, list):
        rows = data
    else:
        raise TypeError("Data should be either a list or a pandas DataFrame.")

    features = []
    for row in rows:
        try:
            start = int(row[1])
            end = int(row[2])
        except (TypeError, ValueError) as e:
            # TypeError covers None coordinates, which int() rejects with a
            # different exception than malformed strings or NaN.
            print(f"Error converting start/end to int: {row[1]}, {row[2]} - {e}")
            continue

        # NOTE(review): start/end are used as given; if they are genomic
        # coordinates while the record sequence is gene-local, the features
        # will lie outside the sequence -- confirm against the caller.
        location = FeatureLocation(start=start, end=end, strand=_strand_to_int(row[3]))
        feature = SeqFeature(location=location, type="misc_feature", qualifiers={
            'label': row[7],  # Use gRNA as the label
            'note': f"Prediction: {row[8]}"  # Include the prediction score
        })
        features.append(feature)

    return features
178
+
179
+
180
def generate_genbank_file_from_df(df, gene_sequence, gene_symbol, output_path):
    """Write a GenBank file annotating ``gene_sequence`` with the targets in ``df``.

    Args:
        df: Target table passed to ``create_genbank_features``.
        gene_sequence: Nucleotide sequence used as the record sequence.
        gene_symbol: Used as both the record id and the record name.
        output_path: Destination of the GenBank file.
    """
    target_features = create_genbank_features(df)
    record_description = f'CRISPR Cas9 predicted targets for {gene_symbol}'

    genbank_record = SeqRecord(
        Seq(gene_sequence),
        id=gene_symbol,
        name=gene_symbol,
        description=record_description,
        features=target_features,
    )
    # Declare the molecule type on the record before it is serialised.
    genbank_record.annotations["molecule_type"] = "DNA"

    SeqIO.write(genbank_record, output_path, "genbank")
186
+
187
+
188
def _strand_to_symbol(strand):
    """Map a strand marker to the BED symbols '+' or '-'.

    '+', and any value that parses as a positive number ('1', '+1', 1), are
    treated as forward; everything else is treated as reverse.
    """
    text = str(strand).strip()
    if text == '+':
        return '+'
    try:
        return '+' if float(text) > 0 else '-'
    except ValueError:
        return '-'


def create_bed_file_from_df(df, output_path):
    """Write the targets in ``df`` to ``output_path`` as a six-column BED file.

    Each row becomes one tab-separated line:
    chrom, start, end, name (the gRNA), score (the prediction), strand.

    Args:
        df: pandas DataFrame with the columns "Chr", "Start Pos", "End Pos",
            "Strand", "gRNA" and "Prediction".
        output_path: Destination of the BED file.
    """
    columns = zip(
        df["Chr"],
        df["Start Pos"],
        df["End Pos"],
        df["gRNA"],
        df["Prediction"],
        df["Strand"],
    )
    with open(output_path, 'w') as bed_file:
        # Only the six standard BED columns are written, so no other column
        # (e.g. the transcript id) is required to be present.
        bed_file.writelines(
            f"{chrom}\t{int(start)}\t{int(end)}\t{grna}\t{score}\t{_strand_to_symbol(strand)}\n"
            for chrom, start, end, grna, score, strand in columns
        )
202
+
203
+
204
def create_csv_from_df(df, output_path):
    """Save ``df`` as a CSV file at ``output_path`` without the index column."""
    df.to_csv(path_or_buf=output_path, index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
 
 
207
 
208
 
209