Spaces:

TianlaiChen
/

PepMLM

Runtime error

App Files Files Community

TianlaiChen committed on Aug 25

Commit

1250540

verified ·

1 Parent(s): a0d4f3a

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -30

app.py CHANGED Viewed

@@ -13,47 +13,48 @@ def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
     sequence = protein_seq + binder_seq
     tensor_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
     total_loss = 0
     # Loop through each token in the binder sequence
     for i in range(-len(binder_seq)-1, -1):
         # Create a copy of the original tensor
         masked_input = tensor_input.clone()
         # Mask one token at a time
         masked_input[0, i] = tokenizer.mask_token_id
         # Create labels
         labels = torch.full(tensor_input.shape, -100).to(model.device)
         labels[0, i] = tensor_input[0, i]
         # Get model prediction and loss
         with torch.no_grad():
             outputs = model(masked_input, labels=labels)
             total_loss += outputs.loss.item()
     # Calculate the average loss
     avg_loss = total_loss / len(binder_seq)
     # Calculate pseudo perplexity
     pseudo_perplexity = np.exp(avg_loss)
     return pseudo_perplexity
 def generate_peptide(protein_seq, peptide_length, top_k, num_binders):
     peptide_length = int(peptide_length)
     top_k = int(top_k)
     num_binders = int(num_binders)
     binders_with_ppl = []
-    for _ in range(num_binders):
         # Generate binder
         masked_peptide = '<mask>' * peptide_length
         input_sequence = protein_seq + masked_peptide
         inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
         with torch.no_grad():
             logits = model(**inputs).logits
         mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
         logits_at_masks = logits[0, mask_token_indices]
@@ -62,25 +63,23 @@ def generate_peptide(protein_seq, peptide_length, top_k, num_binders):
         probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
         predicted_indices = Categorical(probabilities).sample()
         predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
         generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
         # Compute PPL for the generated binder
         ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
         # Add the generated binder and its PPL to the results list
         binders_with_ppl.append([generated_binder, ppl_value])
-        # Convert the list of lists to a pandas dataframe
-        df = pd.DataFrame(binders_with_ppl, columns=["Binder", "Perplexity"])
-        # Save the dataframe to a CSV file
-        output_filename = "output.csv"
-        df.to_csv(output_filename, index=False)
-    return binders_with_ppl, output_filename
 # Define the Gradio interface
 interface = gr.Interface(
@@ -92,12 +91,12 @@ interface = gr.Interface(
         gr.Dropdown(choices=[1, 2, 4, 8, 16, 32], label="Number of Binders", value=1)
     ],
     outputs=[
-    gr.Dataframe(
-        headers=["Binder", "Perplexity"],
-        datatype=["str", "number"],
-        col_count=(2, "fixed")
-    ),
-    gr.outputs.File(label="Download CSV")
     ],
     title="PepMLM: Target Sequence-Conditioned Generation of Peptide Binders via Masked Language Modeling"
 )

     sequence = protein_seq + binder_seq
     tensor_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
     total_loss = 0
     # Loop through each token in the binder sequence
     for i in range(-len(binder_seq)-1, -1):
         # Create a copy of the original tensor
         masked_input = tensor_input.clone()
         # Mask one token at a time
         masked_input[0, i] = tokenizer.mask_token_id
         # Create labels
         labels = torch.full(tensor_input.shape, -100).to(model.device)
         labels[0, i] = tensor_input[0, i]
         # Get model prediction and loss
         with torch.no_grad():
             outputs = model(masked_input, labels=labels)
             total_loss += outputs.loss.item()
     # Calculate the average loss
     avg_loss = total_loss / len(binder_seq)
     # Calculate pseudo perplexity
     pseudo_perplexity = np.exp(avg_loss)
     return pseudo_perplexity
 def generate_peptide(protein_seq, peptide_length, top_k, num_binders):
     peptide_length = int(peptide_length)
     top_k = int(top_k)
     num_binders = int(num_binders)
     binders_with_ppl = []
+    for _ in range(num_binders):  # Fixed: underscore instead of asterisk
         # Generate binder
         masked_peptide = '<mask>' * peptide_length
         input_sequence = protein_seq + masked_peptide
         inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
         with torch.no_grad():
             logits = model(**inputs).logits
         mask_token_indices = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
         logits_at_masks = logits[0, mask_token_indices]
         probabilities = torch.nn.functional.softmax(top_k_logits, dim=-1)
         predicted_indices = Categorical(probabilities).sample()
         predicted_token_ids = top_k_indices.gather(-1, predicted_indices.unsqueeze(-1)).squeeze(-1)
         generated_binder = tokenizer.decode(predicted_token_ids, skip_special_tokens=True).replace(' ', '')
         # Compute PPL for the generated binder
         ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
         # Add the generated binder and its PPL to the results list
         binders_with_ppl.append([generated_binder, ppl_value])
+    # Convert the list of lists to a pandas dataframe
+    df = pd.DataFrame(binders_with_ppl, columns=["Binder", "Perplexity"])
+    # Save the dataframe to a CSV file
+    output_filename = "output.csv"
+    df.to_csv(output_filename, index=False)
+    return df, output_filename  # Return dataframe instead of list
 # Define the Gradio interface
 interface = gr.Interface(
         gr.Dropdown(choices=[1, 2, 4, 8, 16, 32], label="Number of Binders", value=1)
     ],
     outputs=[
+        gr.Dataframe(
+            headers=["Binder", "Perplexity"],
+            datatype=["str", "number"],
+            col_count=(2, "fixed")
+        ),
+        gr.File(label="Download CSV")  # Fixed: Use gr.File instead of gr.outputs.File
     ],
     title="PepMLM: Target Sequence-Conditioned Generation of Peptide Binders via Masked Language Modeling"
 )