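"""Batch-fetch NCBI PubMed abstract pages by PMID.

Given a list of PMIDs, fetch the full content of each PubMed abstract page
(title, abstract, etc.). IDs that start with NBK have the NBK prefix dropped
automatically when the search runs, so this script does not work for them.
"""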
__author__ = 'shixq'
# source://www.greatytc.com/p/b36adf266c3d
from Bio import Entrez
Entrez.email = "2576755886@qq.com" # Put your own e-mail address here; this one is a throwaway QQ mailbox.
# with open('PMID.txt') as pmid_file
def clean_pmids(lines):
    """Return the PMIDs found in *lines*, one per line.

    Surrounding whitespace (including the trailing newline) is removed and
    blank lines are skipped, so a stray empty line in the input file does not
    turn into a request with an empty ID.
    """
    stripped = (line.strip() for line in lines)
    return [pmid for pmid in stripped if pmid]


def join_lines(lines):
    """Collapse the lines of one efetch response into a single line.

    Each line is stripped and empty lines are dropped; the remaining pieces
    are joined with a single space so that the last word of one line is not
    glued to the first word of the next.
    """
    stripped = (line.strip() for line in lines)
    return ' '.join(piece for piece in stripped if piece)


def main(pmid_path='PMID.txt', output_path='PMID_abstract.txt'):
    """Fetch the abstract page of every PMID and write one row per PMID.

    Args:
        pmid_path: Text file with one PMID per line. NBK-prefixed IDs should
            be filtered out beforehand.
        output_path: Destination file; each row is ``PMID<TAB>page text``,
            written as UTF-8.

    Progress is printed to stdout as a percentage after every PMID.
    """
    with open(pmid_path) as pmid_file:
        id_list = clean_pmids(pmid_file)
    total = len(id_list)

    # The output file must stay open for the whole loop; `with` closes it
    # exactly once afterwards, even if a request fails part-way through.
    with open(output_path, 'w', encoding='utf-8') as output:
        for count, pid in enumerate(id_list, start=1):
            # efetch returns the abstract page of this PMID as plain text.
            handle = Entrez.efetch(db="pubmed", id=pid, rettype="abstract", retmode="text")
            try:
                text = join_lines(handle.readlines())
            finally:
                handle.close()
            output.write(pid + '\t' + text + '\n')
            # Show the completed share of the PMID list on screen.
            print('complete', '%.1f%%' % ((count / total) * 100))


if __name__ == '__main__':
    main()
# try:
# ab = record['PubmedArticle']
# print(ab)
# for a in ab:
# print(a)
# except KeyError:
# ab = ['none abstract']
# except IndexError:
# ab = ['AAA:This PMID may be a book & Documents, no abstract']
# for abstract in ab:
# if pid == pid:
# abstract = abstract +
# count += 1
# print('complete', '%.1f%%'%((count/len(id_list))*100))
# output.write(pid + '\t' + sab + '\n')