from Bio import Entrez

# ===== Database overview =====
# NOTE: NCBI asks E-utilities clients to identify themselves; set
# Entrez.email to your own address before running this snippet.

# Open a handle on the EInfo result, which describes every Entrez database.
hd_info = Entrez.einfo()
try:
    # Parse the XML reply into Python objects.
    read_info = Entrez.read(hd_info)
finally:
    # Release the underlying HTTP connection even if parsing fails.
    hd_info.close()

# Print the name of each available database, one per line.
for db in read_info['DbList']:
    print(db)
from Bio import Entrez

# ===== Global search =====
# Query every Entrez database at once for the term "oct4".
hd_egquery = Entrez.egquery(term="oct4")
try:
    read_egquery = Entrez.read(hd_egquery)
finally:
    # Release the underlying HTTP connection even if parsing fails.
    hd_egquery.close()

# Dump the whole parsed reply first, then a compact per-database summary.
print(read_egquery)
for ele in read_egquery["eGQueryResult"]:
    print(ele["DbName"], ele["Count"], ele["Status"])
from Bio import Entrez

# ===== Fetch a summary =====
# Look up the detailed information of one item by its id.
hd_esummary = Entrez.esummary(db="gene", id="397784")
try:
    read_esummary = Entrez.read(hd_esummary)
finally:
    # Release the underlying HTTP connection even if parsing fails.
    hd_esummary.close()

# Print every field of the first (and only requested) gene summary.
gene_summary = read_esummary['DocumentSummarySet']['DocumentSummary'][0]
for key, value in gene_summary.items():
    print(key, value)
from Bio import Entrez

# ===== Search cross-referenced entries =====
# Find the PubMed literature linked to the gene whose id is 5460.
hd_elink = Entrez.elink(dbfrom="gene", db="pubmed", id="5460")
try:
    read_elink = Entrez.read(hd_elink)
finally:
    # Release the underlying HTTP connection even if parsing fails.
    hd_elink.close()

link_sets = read_elink[0]["LinkSetDb"]
print("LinkSetDb: ", link_sets)

# List every target database / link type and how many links each holds.
for lsd in link_sets:
    print(lsd["DbTo"], lsd["LinkName"], len(lsd["Link"]))

# Print the ids of all linked articles in the first link set.
# Guarded because a gene without any links yields an empty LinkSetDb list,
# and indexing [0] on it would raise IndexError.
if link_sets:
    for link in link_sets[0]["Link"]:
        print(link["Id"])
你要上传的 id 列表默认会拼接在 URL 中发送到服务器。这里有一个问题:如果 id 很多,URL 就会变得很长。HTTP 请求一般以 GET 方式发送,而 GET 方式下 URL 的长度是受限的,也就是说,如果 URL 太长,超出限制的部分就无法完整地上传到服务器。为了解决这个问题,只能改用 POST 方式上传:它不限制文本长度,id 列表放在 HTTP 请求体中发送到服务器,并以历史记录(history)的形式存储在服务器上。
# Create the working layout: ncbi/ags for the data, ncbi/tool for gene2xml.
mkdir -p ncbi
cd ncbi
mkdir -p ags
mkdir -p tool

# Download the gene2xml converter (Linux 64-bit build) and unpack it.
cd tool
wget ftp://ftp.ncbi.nlm.nih.gov/toolbox/ncbi_tools/converters/by_program/gene2xml/linux64.gene2xml.gz
gunzip linux64.gene2xml.gz
mv linux64.gene2xml gene2xml
# The unpacked download has no execute bit; without this the call below fails.
chmod +x gene2xml

# Download the binary ASN.1 gene set for Homo sapiens and unpack it.
cd ../ags
wget ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/ASN_BINARY/Mammalia/Homo_sapiens.ags.gz
gunzip Homo_sapiens.ags.gz

# Convert the binary ASN.1 file (-b T) into XML.
../tool/gene2xml -b T -i Homo_sapiens.ags -o Homo_sapiens.xml
下载并转换完成后,你的目录结构类似这样。这里的 Homo_sapiens.xml 大约有 15 GB(2018 年 9 月)。
使用 BioPython 解析
from Bio import Entrez

# ===== Parse a large file =====
# Entrez.parse yields one record at a time, so the multi-gigabyte XML file is
# streamed instead of being loaded into memory at once.
# The file is opened in binary mode because Bio.Entrez requires it, and the
# `with` block guarantees it is closed when iteration ends or fails.
with open("Homo_sapiens.xml", "rb") as hd_parse:
    res_parse = Entrez.parse(hd_parse)
    for record in res_parse:
        gene_track = record['Entrezgene_track-info']['Gene-track']

        # Skip gene records that have been discontinued.
        status = gene_track['Gene-track_status']
        if status.attributes['value'] == 'discontinued':
            continue

        geneid = gene_track['Gene-track_geneid']
        genename = record['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']
        print(geneid, genename)