使用 pybiomart 查询 Ensembl ID 对应的信息
0.目录
- 查询基本方法
- 使用 Ensembl ID 查询对应基因组位置以及对应链
- 使用 Ensembl ID 查询对应 protein 序列
- 更换参考基因组版本(因为有些 Ensembl ID 在人类基因组 GRCh38 中查询不到)
- 使用 Ensembl ID 查询 GRCh37 中有而 GRCh38 中没有的 Ensembl ID,然后将坐标转换回 GRCh38 中的位置。
1.查询基本方法
先建立 Dataset,然后使用 query 函数进行查询:在 attributes 里指定想要返回的字段,在 filters 里指定用什么条件查询,比如这里使用的就是 Ensembl ID,可以传入列表。
from pybiomart import Dataset
# Example Ensembl gene IDs to look up (TP53 here); replace with your own list.
# Defined here so the snippet runs on its own instead of raising NameError.
gene_ids = ["ENSG00000141510"]

# Open the human gene dataset on the GRCh37 BioMart host.
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://grch37.ensembl.org')

# `attributes` lists the columns to return; `filters` says which field the
# lookup values belong to (a list of values is accepted).
df37 = dataset.query(
    attributes=[
        'ensembl_gene_id',  # gene_id
    ],
    filters={'link_ensembl_gene_id': gene_ids},
)
这里看看attributes里和filters里面有什么
from pybiomart import Dataset

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

# Show every filter the dataset accepts (the whole mapping is printed).
print(dataset.filters)

# Show every attribute the dataset can return (the whole mapping is printed).
print(dataset.attributes)

# Sample output, truncated:
# {'link_so_mini_closure': <biomart.Filter name='link_so_mini_closure',type='list'>, ...
# {'ensembl_gene_id': <biomart.Attribute name='ensembl_gene_id', display_name='Gene stable ID', description='Stable ID of the Gene'>, ...
2.使用Ensembl ID查询对应基因组位置以及对应链
from pybiomart import Dataset
# Open the human gene dataset on the GRCh37 BioMart host.
grch37_host = 'http://grch37.ensembl.org'
dataset = Dataset(name='hsapiens_gene_ensembl', host=grch37_host)

# Gene IDs to look up, taken from the `gene_id` column of `missing_rows`.
gene_ids = missing_rows['gene_id'].tolist()

# Columns to fetch: gene ID, gene name, chromosome, start, end and strand.
position_attributes = [
    'ensembl_gene_id',
    'external_gene_name',
    'chromosome_name',
    'start_position',
    'end_position',
    'strand',
]
df37 = dataset.query(
    attributes=position_attributes,
    filters={'link_ensembl_gene_id': gene_ids},
)
3.使用Ensembl ID查询对应protein序列
# Example list of Ensembl gene IDs.
gene_ids = ["ENSG00000141510", "ENSG00000155657"]  # TP53, TTN

# Fetch the gene, transcript and protein IDs together with the peptide sequence.
df_protein = dataset.query(
    attributes=[
        'ensembl_gene_id',
        'ensembl_transcript_id',
        'ensembl_peptide_id',
        'peptide',  # amino-acid sequence
    ],
    filters={'link_ensembl_gene_id': gene_ids},
)

print(df_protein.head())
4. 更换参考基因组版本
from pybiomart import Dataset

# The main Ensembl host serves GRCh38 by default.
dataset = Dataset(name='hsapiens_gene_ensembl',
                  host='http://www.ensembl.org')  # default: GRCh38

# Point `host` at the GRCh37 site to use the GRCh37 BioMart instead.
# This rebinds `dataset`; keep only the assignment for the assembly you need.
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://grch37.ensembl.org')
5.使用Ensembl ID查询GRCh37中有而GRCh38中没有的Ensembl ID,然后将坐标转换回GRCh38中的位置。
import pandas as pd
from pybiomart import Dataset
from pyliftover import LiftOver
# -----------------------------
# 1️⃣ Load the CSV
# -----------------------------
merged_df = pd.read_csv("adata_features_with_annotation.csv")
print(f"总行数: {len(merged_df)}")

# Select the rows that lack a chromosome or a start coordinate.
no_chromosome = merged_df["chromosome"].isna()
no_start = merged_df["start"].isna()
missing_rows = merged_df[no_chromosome | no_start]
print(f"缺失行数: {len(missing_rows)}")
# -----------------------------
# 2️⃣ Look up gene positions on GRCh37 (gene_name included)
# -----------------------------
grch37_host = 'http://grch37.ensembl.org'
dataset = Dataset(name='hsapiens_gene_ensembl', host=grch37_host)

# Only the genes whose rows were flagged as incomplete are queried.
gene_ids = missing_rows['gene_id'].tolist()

# Columns to fetch: gene ID, gene name, chromosome, start, end and strand.
position_attributes = [
    'ensembl_gene_id',
    'external_gene_name',
    'chromosome_name',
    'start_position',
    'end_position',
    'strand',
]
df37 = dataset.query(
    attributes=position_attributes,
    filters={'link_ensembl_gene_id': gene_ids},
)
# -----------------------------
# 3️⃣ LiftOver GRCh37 → GRCh38
# -----------------------------
# Converter used below to translate hg19 positions into hg38 positions
# (hg19/hg38 are the names passed here for the GRCh37 → GRCh38 step).
lo = LiftOver('hg19', 'hg38')
def liftover_row(row, converter=None):
    """Lift one GRCh37 gene record over to GRCh38 coordinates.

    Args:
        row: Mapping-like record (for example a DataFrame row) with the keys
            'Chromosome/scaffold name', 'Gene start (bp)', 'Gene end (bp)'
            and 'Strand'.
        converter: Object exposing ``convert_coordinate(chromosome, position)``
            that returns a list of ``(chromosome, position, strand, score)``
            tuples. Defaults to the module-level ``lo`` converter.

    Returns:
        pandas.Series with 'chromosome', 'start', 'end' and 'strand'.
        All four values are None when either end cannot be converted, or when
        the two ends land on different chromosomes or strands.
    """
    if converter is None:
        converter = lo

    chrom_name = row['Chromosome/scaffold name']
    if not str(chrom_name).startswith("chr"):
        chrom_name = f"chr{chrom_name}"
    start = int(row['Gene start (bp)'])
    end = int(row['Gene end (bp)'])

    # pyliftover positions are 0-based while Ensembl gene coordinates are
    # 1-based: shift down by one before converting and back up afterwards.
    lifted_start = converter.convert_coordinate(chrom_name, start - 1)
    lifted_end = converter.convert_coordinate(chrom_name, end - 1)

    unmapped = pd.Series({'chromosome': None, 'start': None, 'end': None, 'strand': None})
    if not lifted_start or not lifted_end:
        return unmapped

    # Use the first hit for each end, as the original code did.
    start_hit = lifted_start[0]
    end_hit = lifted_end[0]
    # Ends that disagree on chromosome or strand do not describe one interval.
    if start_hit[0] != end_hit[0] or start_hit[2] != end_hit[2]:
        return unmapped

    strand = '+' if row['Strand'] in [1, '+', '1'] else '-'
    if start_hit[2] == '-':
        # The region is inverted in the target assembly, so the gene's
        # orientation flips as well.
        strand = '-' if strand == '+' else '+'

    # Sorting keeps start <= end even when the region was inverted.
    new_start, new_end = sorted((start_hit[1] + 1, end_hit[1] + 1))
    new_chr = start_hit[0].replace('chr', '')
    return pd.Series({'chromosome': new_chr, 'start': new_start, 'end': new_end, 'strand': strand})
coord_columns = ['chromosome', 'start', 'end', 'strand']
if df37.empty:
    # apply() on an empty frame does not produce the coordinate columns, so
    # build an empty frame that has them to keep the lookups below valid.
    lifted_coords = pd.DataFrame(columns=coord_columns, index=df37.index)
else:
    lifted_coords = df37.apply(liftover_row, axis=1)

# -----------------------------
# 4️⃣ Write the results back into the original table (gene_name included)
# -----------------------------
# Lookup tables keyed by Ensembl gene ID.
gene_name_dict = df37.set_index('Gene stable ID')['Gene name'].to_dict()
coords_by_gene = lifted_coords.set_index(df37['Gene stable ID'])
chrom_dict = coords_by_gene['chromosome'].to_dict()
start_dict = coords_by_gene['start'].to_dict()
end_dict = coords_by_gene['end'].to_dict()
strand_dict = coords_by_gene['strand'].to_dict()

# Fill in the values. A looked-up value replaces the current one only when it
# is not null, so a failed liftover or an unnamed gene never erases data that
# the table already had.
lookups = {
    'gene_name': gene_name_dict,
    'chromosome': chrom_dict,
    'start': start_dict,
    'end': end_dict,
    'strand': strand_dict,
}
for column, lookup in lookups.items():
    found = merged_df['gene_id'].map(lookup)
    merged_df[column] = found.where(found.notna(), merged_df[column])
# -----------------------------
# 5️⃣ Select the output columns in a fixed order and save
# -----------------------------
output_columns = ['gene_id', 'gene_name', 'chromosome', 'start', 'end', 'strand']
output_path = "adata_features_with_annotation_filled.csv"

# Only the columns listed above are kept in the saved file.
merged_df = merged_df[output_columns]
merged_df.to_csv(output_path, index=False)
print("补全完成,已保存到 adata_features_with_annotation_filled.csv")