Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions sanclone/tools/load_insert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from Bio import Entrez, SeqIO
import json
from sanclone.tools import settings


from langchain.tools import BaseTool

from ..state import State


class ParseGeneTool(BaseTool):
    """Tool that looks up a gene sequence by name and stores it as the insert.

    The record returned by ``fetch_sequence`` is written to
    ``shared_state.linear_insert``.
    """

    name = "get_insert_from_name"
    # The example uses double quotes because the input is parsed with
    # json.loads, which rejects single-quoted keys and values.
    description = """A tool that takes in a json object of the
form {"gene_name": ..., "organism": ...} and update the
internal state to include the linear insert GenBank file. """
    shared_state: State

    def _run(self, query: str) -> str:
        """Parse the JSON query, fetch the sequence, and store it.

        Args:
            query: JSON text of the form
                ``{"gene_name": "...", "organism": "..."}``.

        Returns:
            A status message: success with the record description, a
            not-found message, or an invalid-input message when the query
            is not a JSON object containing both keys.
        """
        try:
            request = json.loads(query)
            gene_name = request["gene_name"]
            organism = request["organism"]
        except (json.JSONDecodeError, KeyError, TypeError):
            # Report bad input as a message, like the not-found case below,
            # instead of letting the exception escape from the tool.
            return (
                "Invalid input: expected a JSON object of the form "
                '{"gene_name": ..., "organism": ...}'
            )

        seq_record = fetch_sequence(gene_name, organism)
        if seq_record is None:
            return "Could not find Sequence"

        self.shared_state.linear_insert = seq_record
        return f"Sequence {seq_record.description} is loaded. "

def fetch_sequence(gene_name, organism):
    """Fetch the first NCBI nucleotide mRNA hit for a gene and organism.

    Args:
        gene_name: Gene name, matched against the ``[Gene Name]`` field.
        organism: Organism name, matched against the ``[Organism]`` field.

    Returns:
        The record parsed by ``SeqIO.read`` from the FASTA response, or
        ``None`` when the search returns no IDs.
    """
    Entrez.email = settings.email  # Always tell NCBI who you are
    search_term = f"{gene_name}[Gene Name] AND {organism}[Organism] AND mRNA[Filter]"

    # Search for the gene's mRNA ID; only the first hit is used.
    handle = Entrez.esearch(db="nucleotide", term=search_term, retmax=1)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()

    id_list = record.get("IdList")
    if not id_list:
        return None

    gene_id = id_list[0]

    # Fetch the sequence based on the ID.
    # NOTE(review): the calling tool's description mentions a GenBank file,
    # but this requests FASTA -- confirm which format is intended.
    handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="fasta", retmode="text")
    try:
        seq_record = SeqIO.read(handle, "fasta")
    finally:
        handle.close()

    return seq_record
65 changes: 65 additions & 0 deletions sanclone/tools/load_virus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from langchain.tools import BaseTool
from Bio import Entrez
from Bio import SeqIO
import os

from ..state import State
from sanclone.tools import settings


class ParseVirusTool(BaseTool):
    """Tool that downloads a vector's GenBank record and stores it in state.

    The first record parsed from the downloaded file is written to
    ``shared_state.vector``.
    """

    name = "get_virus_from_name"
    description = """Use this function when being asked to take in a virus name
to get the bioPython seqRecord object.
Input: a virus name, output: a virus genbank internal file"""
    shared_state: State

    def _run(self, query: str) -> str:
        """Download the GenBank file for ``query`` and load its first record.

        Args:
            query: Vector name used as the search term, e.g. ``'pET-16b'``.

        Returns:
            A status message: success with the record description, or
            "Could not find Vector" when nothing was downloaded or the
            file contains no records.
        """
        # get_vector_data reads the output folder from settings itself and
        # returns None when the search finds nothing.
        genbank_filename = get_vector_data(query)
        if not genbank_filename:
            return "Could not find Vector"

        # Take only the first record; the context manager closes the file.
        with open(genbank_filename, "r") as genbank_file:
            seq_obj = next(SeqIO.parse(genbank_file, "genbank"), None)

        if seq_obj is None:
            return "Could not find Vector"

        self.shared_state.vector = seq_obj
        return f"Vector {seq_obj.description} is loaded. "


def get_vector_data(vector_name, output_folder=None):
    """Download the first GenBank record matching a name and save it to disk.

    Args:
        vector_name: Search term sent to the NCBI nucleotide database.
        output_folder: Folder to write the ``.gbk`` file into. Defaults to
            ``settings.OUTPUT_FOLDER`` when omitted or ``None``.

    Returns:
        Path of the written GenBank file, or ``None`` when the search
        returns no IDs.
    """
    if output_folder is None:
        output_folder = settings.OUTPUT_FOLDER

    # Set your email address for NCBI Entrez. This is required.
    Entrez.email = settings.email

    # Use Entrez to search for GenBank records.
    search_handle = Entrez.esearch(db="nucleotide", term=vector_name)
    try:
        search_results = Entrez.read(search_handle)
    finally:
        # Close the handle even when parsing the response fails.
        search_handle.close()

    # Check if any results were found.
    id_list = search_results.get("IdList")
    if not id_list:
        print(f"No GenBank records found for {vector_name}")
        return None

    # Only the first GenBank ID from the search results is used.
    genbank_id = id_list[0]

    # Download the GenBank record.
    fetch_handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text")
    try:
        genbank_record = SeqIO.read(fetch_handle, "genbank")
    finally:
        fetch_handle.close()

    # Save the GenBank record to a file; exist_ok avoids the race between
    # checking for the folder and creating it.
    os.makedirs(output_folder, exist_ok=True)
    filename = os.path.join(output_folder, f"{genbank_record.id}.gbk")
    SeqIO.write(genbank_record, filename, "genbank")

    return filename


def get_genbank_from_soup(query):
    """Placeholder: ignores ``query`` and always returns ``None``."""
    return None
2 changes: 2 additions & 0 deletions sanclone/tools/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Contact address assigned to Entrez.email before NCBI requests are made;
# replace the placeholder with a real address.
email = "youremail@example.com"
# Folder that downloaded GenBank files are written into.
OUTPUT_FOLDER = "./"