TianlaiChen committed on
Commit
1250540
·
verified ·
1 Parent(s): a0d4f3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -30
app.py CHANGED
@@ -13,47 +13,48 @@ def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
13
  sequence = protein_seq + binder_seq
14
  tensor_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
15
  total_loss = 0
16
-
17
  # Loop through each token in the binder sequence
18
  for i in range(-len(binder_seq)-1, -1):
19
  # Create a copy of the original tensor
20
  masked_input = tensor_input.clone()
21
-
22
  # Mask one token at a time
23
  masked_input[0, i] = tokenizer.mask_token_id
 
24
  # Create labels
25
  labels = torch.full(tensor_input.shape, -100).to(model.device)
26
  labels[0, i] = tensor_input[0, i]
27
-
28
  # Get model prediction and loss
29
  with torch.no_grad():
30
  outputs = model(masked_input, labels=labels)
31
  total_loss += outputs.loss.item()
32
-
33
  # Calculate the average loss
34
  avg_loss = total_loss / len(binder_seq)
35
-
36
  # Calculate pseudo perplexity
37
  pseudo_perplexity = np.exp(avg_loss)
 
38
  return pseudo_perplexity
39
 
40
-
41
  def generate_peptide(protein_seq, peptide_length, top_k, num_binders):
42
-
43
  peptide_length = int(peptide_length)
44
  top_k = int(top_k)
45
  num_binders = int(num_binders)
46
-
47
  binders_with_ppl = []
48
-
49
- for _ in range(num_binders):
50
  # Generate binder
51
  masked_peptide = '<mask>' * peptide_length
52
  input_sequence = protein_seq + masked_peptide
 
53
  inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
54
-
55
  with torch.no_grad():
56
  logits = model(**inputs).logits
 
57
  mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
58
  logits_at_masks = logits[0, mask_token_indices]
59
 
@@ -62,25 +63,23 @@ def generate_peptide(protein_seq, peptide_length, top_k, num_binders):
62
  probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
63
  predicted_indices = Categorical(probabilities).sample()
64
  predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
65
-
66
  generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
67
-
68
  # Compute PPL for the generated binder
69
  ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
70
-
71
  # Add the generated binder and its PPL to the results list
72
  binders_with_ppl.append([generated_binder, ppl_value])
73
-
74
- # Convert the list of lists to a pandas dataframe
75
- df = pd.DataFrame(binders_with_ppl, columns=["Binder", "Perplexity"])
76
 
77
- # Save the dataframe to a CSV file
78
- output_filename = "output.csv"
79
- df.to_csv(output_filename, index=False)
80
-
81
-
82
- return binders_with_ppl, output_filename
83
-
 
84
 
85
  # Define the Gradio interface
86
  interface = gr.Interface(
@@ -92,12 +91,12 @@ interface = gr.Interface(
92
  gr.Dropdown(choices=[1, 2, 4, 8, 16, 32], label="Number of Binders", value=1)
93
  ],
94
  outputs=[
95
- gr.Dataframe(
96
- headers=["Binder", "Perplexity"],
97
- datatype=["str", "number"],
98
- col_count=(2, "fixed")
99
- ),
100
- gr.outputs.File(label="Download CSV")
101
  ],
102
  title="PepMLM: Target Sequence-Conditioned Generation of Peptide Binders via Masked Language Modeling"
103
  )
 
13
  sequence = protein_seq + binder_seq
14
  tensor_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
15
  total_loss = 0
16
+
17
  # Loop through each token in the binder sequence
18
  for i in range(-len(binder_seq)-1, -1):
19
  # Create a copy of the original tensor
20
  masked_input = tensor_input.clone()
 
21
  # Mask one token at a time
22
  masked_input[0, i] = tokenizer.mask_token_id
23
+
24
  # Create labels
25
  labels = torch.full(tensor_input.shape, -100).to(model.device)
26
  labels[0, i] = tensor_input[0, i]
27
+
28
  # Get model prediction and loss
29
  with torch.no_grad():
30
  outputs = model(masked_input, labels=labels)
31
  total_loss += outputs.loss.item()
32
+
33
  # Calculate the average loss
34
  avg_loss = total_loss / len(binder_seq)
35
+
36
  # Calculate pseudo perplexity
37
  pseudo_perplexity = np.exp(avg_loss)
38
+
39
  return pseudo_perplexity
40
 
 
41
  def generate_peptide(protein_seq, peptide_length, top_k, num_binders):
 
42
  peptide_length = int(peptide_length)
43
  top_k = int(top_k)
44
  num_binders = int(num_binders)
45
+
46
  binders_with_ppl = []
47
+
48
+ for _ in range(num_binders): # Fixed: underscore instead of asterisk
49
  # Generate binder
50
  masked_peptide = '<mask>' * peptide_length
51
  input_sequence = protein_seq + masked_peptide
52
+
53
  inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
54
+
55
  with torch.no_grad():
56
  logits = model(**inputs).logits
57
+
58
  mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
59
  logits_at_masks = logits[0, mask_token_indices]
60
 
 
63
  probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
64
  predicted_indices = Categorical(probabilities).sample()
65
  predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
66
+
67
  generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
68
+
69
  # Compute PPL for the generated binder
70
  ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
71
+
72
  # Add the generated binder and its PPL to the results list
73
  binders_with_ppl.append([generated_binder, ppl_value])
 
 
 
74
 
75
+ # Convert the list of lists to a pandas dataframe
76
+ df = pd.DataFrame(binders_with_ppl, columns=["Binder", "Perplexity"])
77
+
78
+ # Save the dataframe to a CSV file
79
+ output_filename = "output.csv"
80
+ df.to_csv(output_filename, index=False)
81
+
82
+ return df, output_filename # Return dataframe instead of list
83
 
84
  # Define the Gradio interface
85
  interface = gr.Interface(
 
91
  gr.Dropdown(choices=[1, 2, 4, 8, 16, 32], label="Number of Binders", value=1)
92
  ],
93
  outputs=[
94
+ gr.Dataframe(
95
+ headers=["Binder", "Perplexity"],
96
+ datatype=["str", "number"],
97
+ col_count=(2, "fixed")
98
+ ),
99
+ gr.File(label="Download CSV") # Fixed: Use gr.File instead of gr.outputs.File
100
  ],
101
  title="PepMLM: Target Sequence-Conditioned Generation of Peptide Binders via Masked Language Modeling"
102
  )