From 5d535e3bbae5d4cbe7fab302f2c8e22fbfb824dc Mon Sep 17 00:00:00 2001 From: Albert Qu Date: Fri, 6 Oct 2023 16:38:41 -0700 Subject: [PATCH 1/3] adding first pass tools --- sanclone/tools/load_insert.py | 47 ++++++++++++++++++++++++++ sanclone/tools/load_virus.py | 63 +++++++++++++++++++++++++++++++++++ sanclone/tools/settings.py | 2 ++ 3 files changed, 112 insertions(+) create mode 100644 sanclone/tools/load_insert.py create mode 100644 sanclone/tools/load_virus.py create mode 100644 sanclone/tools/settings.py diff --git a/sanclone/tools/load_insert.py b/sanclone/tools/load_insert.py new file mode 100644 index 0000000..87f2266 --- /dev/null +++ b/sanclone/tools/load_insert.py @@ -0,0 +1,47 @@ +from Bio import Entrez, SeqIO +import json + + +from langchain.tools import BaseTool + +from ..state import State + + +class ParseGeneTool(BaseTool): + name = "parse_genes" + description = "a tool that parses in the virus prompt" + shared_state: State + + def _run(self, query: str) -> str: + # Assume query is a json object of the form {"gene_name": "gene_name", "organism": "organism"} + qson = json.loads(query) + gene_name = qson['gene_name'] + organism = qson['organism'] + seq_record = fetch_sequence(gene_name, organism) + if seq_record is not None: + self.shared_state.linear_insert = seq_record + return f"Sequence {seq_record.description} is loaded. " + else: + return "Could not find Sequence" + +def fetch_sequence(gene_name, organism): + Entrez.email = "your.email@example.com" # Always tell NCBI who you are + search_term = f"{gene_name}[Gene Name] AND {organism}[Organism] AND mRNA[Filter]" + + # Search for the gene's mRNA ID + handle = Entrez.esearch(db="nucleotide", term=search_term, retmax=1) + record = Entrez.read(handle) + handle.close() + + if not record["IdList"]: + # print("No sequence found!") + return None + + gene_id = record["IdList"][0] + + # Fetch the sequence based on the ID + handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="fasta", retmode="text") + seq_record = SeqIO.read(handle, "fasta") + handle.close() + + return seq_record \ No newline at end of file diff --git a/sanclone/tools/load_virus.py b/sanclone/tools/load_virus.py new file mode 100644 index 0000000..3937ef3 --- /dev/null +++ b/sanclone/tools/load_virus.py @@ -0,0 +1,63 @@ +from langchain.tools import BaseTool +from Bio import Entrez +from Bio import SeqIO +import os + +from ..state import State +from sanclone.tools import settings + + +class ParseVirusTool(BaseTool): + name = "parse_virus" + description = "a tool that parses in the virus prompt" + shared_state: State + + def _run(self, query: str) -> str: + # Assume vector is vector name ParseVirusTool()._run('pET-16b') -> seq Record + genbank_filename = get_vector_data(query, settings.OUTPUT_FOLDER) + seqObj = list(SeqIO.parse(open(genbank_filename,"r"), "genbank"))[0] + if seqObj is not None: + self.shared_state.vector = seqObj + return f"Vector {seqObj.description} is loaded. " + else: + return "Could not find Vector" + + +def get_vector_data(vector_name): + # Set your email address for NCBI Entrez. This is required. + output_folder = settings.OUTPUT_FOLDER + Entrez.email = settings.email + + # Define the search query using the vector_name input + search_query = vector_name + + # Use Entrez to search for GenBank records + search_handle = Entrez.esearch(db="nucleotide", term=search_query) + search_results = Entrez.read(search_handle) + search_handle.close() + + # Check if any results were found + if "IdList" not in search_results or not search_results["IdList"]: + print(f"No GenBank records found for {vector_name}") + return + + # Extract the first GenBank ID from the search results + genbank_id = search_results["IdList"][0] + + # Download the GenBank record and save it to a file + fetch_handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") + genbank_record = SeqIO.read(fetch_handle, "genbank") + fetch_handle.close() + + # Save the GenBank record to a file + if not os.path.exists(output_folder): + os.makedirs(output_folder) + filename = os.path.join(output_folder, f"{genbank_record.id}.gbk") + SeqIO.write(genbank_record, filename, "genbank") + + #print(f"Downloaded GenBank file for {vector_name} to {filename}") + return filename + + +def get_genbank_from_soup(query): + return None \ No newline at end of file diff --git a/sanclone/tools/settings.py b/sanclone/tools/settings.py new file mode 100644 index 0000000..ddfd85c --- /dev/null +++ b/sanclone/tools/settings.py @@ -0,0 +1,2 @@ +email = 'youremail@example.com' +OUTPUT_FOLDER = './' \ No newline at end of file From 0da9d6b0d115d6ed23ef3fcb438d04fcda8125e8 Mon Sep 17 00:00:00 2001 From: Albert Qu Date: Fri, 6 Oct 2023 16:50:34 -0700 Subject: [PATCH 2/3] merge this --- sanclone/tools/load_insert.py | 5 +++-- sanclone/tools/load_virus.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sanclone/tools/load_insert.py b/sanclone/tools/load_insert.py index 87f2266..52dbf6b 100644 --- a/sanclone/tools/load_insert.py +++ b/sanclone/tools/load_insert.py @@ -1,5 +1,6 @@ from Bio import Entrez, SeqIO import json +from sanclone.tools import settings from langchain.tools import BaseTool @@ -9,7 +10,7 @@ class ParseGeneTool(BaseTool): name = "parse_genes" - description = "a tool that parses in the virus prompt" + description = "A tool that takes in a json object of the form {'gene_name': ..., 'organism': ...} and update the internal state to include the linear insert GenBank file" shared_state: State def _run(self, query: str) -> str: @@ -25,7 +26,7 @@ def _run(self, query: str) -> str: return "Could not find Sequence" def fetch_sequence(gene_name, organism): - Entrez.email = "your.email@example.com" # Always tell NCBI who you are + Entrez.email = settings.email # Always tell NCBI who you search_term = f"{gene_name}[Gene Name] AND {organism}[Organism] AND mRNA[Filter]" # Search for the gene's mRNA ID diff --git a/sanclone/tools/load_virus.py b/sanclone/tools/load_virus.py index 3937ef3..e343b7a 100644 --- a/sanclone/tools/load_virus.py +++ b/sanclone/tools/load_virus.py @@ -9,7 +9,7 @@ class ParseVirusTool(BaseTool): name = "parse_virus" - description = "a tool that parses in the virus prompt" + description = "Input: a virus name, output: a virus genbank internal file" shared_state: State def _run(self, query: str) -> str: From 6e802aaaf3fca9618f3976f4cd4ecbd31f1da8d2 Mon Sep 17 00:00:00 2001 From: Albert Qu Date: Fri, 6 Oct 2023 16:56:30 -0700 Subject: [PATCH 3/3] add virus and linear insert --- sanclone/tools/load_insert.py | 6 ++++-- sanclone/tools/load_virus.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sanclone/tools/load_insert.py b/sanclone/tools/load_insert.py index 52dbf6b..77faf09 100644 --- a/sanclone/tools/load_insert.py +++ b/sanclone/tools/load_insert.py @@ -9,8 +9,10 @@ class ParseGeneTool(BaseTool): - name = "parse_genes" - description = "A tool that takes in a json object of the form {'gene_name': ..., 'organism': ...} and update the internal state to include the linear insert GenBank file" + name = "get_insert_from_name" + description = """A tool that takes in a json object of the + form {'gene_name': ..., 'organism': ...} and update the + internal state to include the linear insert GenBank file. """ shared_state: State def _run(self, query: str) -> str: diff --git a/sanclone/tools/load_virus.py b/sanclone/tools/load_virus.py index e343b7a..bf77572 100644 --- a/sanclone/tools/load_virus.py +++ b/sanclone/tools/load_virus.py @@ -8,8 +8,10 @@ class ParseVirusTool(BaseTool): - name = "parse_virus" - description = "Input: a virus name, output: a virus genbank internal file" + name = "get_virus_from_name" + description = """Use this function when being asked to take in a virus name + to get the bioPython seqRecord object. + Input: a virus name, output: a virus genbank internal file""" shared_state: State def _run(self, query: str) -> str: