Problem
Some perturbation datasets are enormous. The HEK293T dataset, for example, contains roughly 18,000 perturbations in a single file of well over 100 GB; reading it all into memory, say to compute differentially expressed genes per perturbation, is hopeless. The file has to be split into smaller parts first, but the naive route of loading the whole thing and then slicing off a piece to save kept blowing up my memory, so I'm writing down the approach that worked: keep the file in AnnData's backed mode from start to finish.
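In backed mode the expression matrix stays on disk and only the obs/var metadata is read, so even a 100+ GB file opens almost instantly. A quick sanity check along these lines (the printed values are illustrative, not from the original run):

import scanpy as sc

# backed='r' keeps X on disk read-only; only .obs / .var are loaded into RAM
adata = sc.read_h5ad('/xxx/hek293t.h5', backed='r')
print(adata.isbacked)                       # True
print(adata.shape)                          # (n_cells, n_genes)
print(adata.obs["target_gene"].nunique())   # ~18,000 targets plus the control

The split itself: group the perturbations into 10 chunks, and for each chunk copy the matching cells (plus the non-targeting controls) straight to a new backed file.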
import scanpy as sc
import numpy as np

# open in backed mode: X stays on disk, only the metadata is loaded
adata = sc.read_h5ad('/xxx/hek293t.h5', backed='r')

# all perturbations except the non-targeting control, in a stable order
all_perts = sorted(p for p in adata.obs["target_gene"].unique() if p != "non-targeting")

# split the perturbation list into 10 roughly equal chunks
pert_splits = np.array_split(all_perts, 10)

for i, pert_list in enumerate(pert_splits, start=1):
    print(f"[INFO] Generating part {i}, perturbations: {len(pert_list)}")
    # keep this chunk's perturbations plus the control cells in every part
    idx = adata.obs["target_gene"].isin(list(pert_list) + ["non-targeting"])
    # ⚡ copy straight to a new file while still in backed mode,
    # so the subset's X never materializes in memory
    sub_adata = adata[idx].copy(filename=f'/xxx/hek293t_part{i}.h5')
    print(f"[INFO] Saved to hek293t_part{i}.h5")