Create genetic perturabation data

1. import the necessary libraries

[1]:
import xpert as xp
from xpert.data.utils import get_info_txt
import scanpy as sc
import os
import pickle
import warnings
import logging
warnings.filterwarnings('ignore')
sc.settings.verbosity = 0
logging.getLogger('scanpy').setLevel(logging.ERROR)
logging.getLogger('anndata').setLevel(logging.ERROR)

2. load original adata file of perturbation data

[2]:
adata = sc.read('../../data/Norman2019/NormanWeissman2019_filtered.h5ad')
adata
[2]:
AnnData object with n_obs × n_vars = 111445 × 33694
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo'
    var: 'ensemble_id', 'ncounts', 'ncells'

add obs.column with ‘perturbation_new’ and ‘cell_type_new’, which is used to perform data proprecessing.

[3]:
def get_perturbation(x):
    return '; '.join(x['perturbation'].split('_'))

adata.obs['perturbation_new'] = adata.obs.apply(get_perturbation, axis=1)
adata.obs['celltype_new'] = adata.obs['celltype']

def get_perturbation_group(x):
    return ' | '.join([x['perturbation_new'], x['celltype_new']])
perturbation_group = adata.obs.apply(get_perturbation_group, axis=1)
adata.obs['perturbation_group'] = perturbation_group

adata.obs['sgRNA_new'] = 'control'

3. generate Pert_Data object

[4]:
# parameters

pert_cell_filter = 100 # this is used to filter perts, cell number less than this will be filtered
seed = 2024 # this is the random seed
split_type = 1 # 1 for unseen perts; 0 for unseen celltypes
split_ratio = [0.7, 0.2, 0.1] # train:test:val; val is used to choose data, test is for final validation
var_num = 5000 # selecting hvg number
num_de_genes = 20 # number of de genes
bs_train = 32 # batch size of trainloader
bs_test = 32 # batch size of testloader
data_dir = '../../data/Norman2019/'
[5]:
# Create Pert_Data object
pert_data = xp.data.Byte_Pert_Data(
    prefix='Norman2019',
    pert_cell_filter=pert_cell_filter,
    seed=seed,
    split_ratio=split_ratio,
    split_type=split_type,
    var_num=var_num,
    num_de_genes=num_de_genes,
    bs_train=bs_train,
    bs_test=bs_test
)

# Complete data processing pipeline
print("Step 1: Reading files...")
pert_data.read_files(adata)

print("Step 2: Filtering perturbations...")
pert_data.filter_perturbation()

print("Step 3: Preprocessing adata and selecting HVGs...")
pert_data.get_and_process_adata(var_num=pert_data.var_num)

print("Step 4: Setting control barcodes...")
pert_data.set_control_barcode()

print("Step 5: Calculating E-distances...")
pert_data.get_edis_2()

print("Step 6: Filtering sgRNAs...")
pert_data.adata_split.obs['sgRNA_new'] = 'control'
pert_data.filter_sgRNA()

print("Step 7: Data splitting...")
pert_data.data_split(split_type=1, test_perts=None)

print("Step 8: Getting differential genes...")
pert_data.get_de_genes()

print("Step 9: Saving processed data...")
# Save the processed data
pickle.dump(pert_data, open(os.path.join(data_dir, 'pert_data.pkl'), 'wb'))

print("Pert_Data object generation completed!")
print(f"Final dataset shape: {pert_data.adata_split.shape}")
print(f"Number of perturbation groups: {len(pert_data.filter_perturbation_list)}")
Step 1: Reading files...
========== read file finished!
Step 2: Filtering perturbations...
retain_pert_num is: 225
filtered pert num is:  12
100%|██████████| 236/236 [00:03<00:00, 73.45it/s]
========== filter perturbation finished!
Step 3: Preprocessing adata and selecting HVGs...
len of exclude_var_list is 4
len of pert_gene_list is 343
len of final var_names is 5040
========== get var genes finished!
this is new version
Step 4: Setting control barcodes...
100%|██████████| 224/224 [00:03<00:00, 69.05it/s]
========== set control barcodes finished!
Step 5: Calculating E-distances...
Step 6: Filtering sgRNAs...
0it [00:00, ?it/s]
Step 7: Data splitting...
========== data split finished!
Step 8: Getting differential genes...
100%|██████████| 224/224 [01:04<00:00,  3.48it/s]
========== get de genes finished!
Step 9: Saving processed data...
Pert_Data object generation completed!
Final dataset shape: (110554, 5040)
Number of perturbation groups: 224

4. Pert_Data Information Analysis and Visualization

[ ]:
# =============================================================================
# Pert_Data Information Analysis and Visualization
# =============================================================================

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

# --------- 1. 分布相关的两个统计图分开画 ---------
print("1. Analyzing cell count distribution...")
pert_cell_counts = []
pert_names = []
for pert in pert_data.filter_perturbation_list:
    cell_count = len(pert_data.adata_split[pert_data.adata_split.obs['perturbation_group'] == pert])
    pert_cell_counts.append(cell_count)
    pert_names.append(pert.split(' | ')[0])  # 只取gene name

mean_cells = np.mean(pert_cell_counts)
median_cells = np.median(pert_cell_counts)

# 单独画:每个扰动的细胞数量分布(直方图)
fig1, ax1 = plt.subplots(figsize=(7, 5))
ax1.hist(pert_cell_counts, bins=30, alpha=0.7, edgecolor='black')
ax1.set_xlabel('Number of Cells per Perturbation')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of Cell Counts per Perturbation')
ax1.grid(True, alpha=0.3)
ax1.axvline(mean_cells, color='red', linestyle='--', label=f'Mean: {mean_cells:.1f}')
ax1.axvline(median_cells, color='orange', linestyle='--', label=f'Median: {median_cells:.1f}')
ax1.legend()
plt.tight_layout()
plt.show()

print(f"   - Mean cells per perturbation: {mean_cells:.1f}")
print(f"   - Median cells per perturbation: {median_cells:.1f}")
print(f"   - Total perturbations: {len(pert_cell_counts)}")

# --------- 2. E-distance分布(单独画) ---------
print("\n2. Analyzing E-distance distribution...")
fig2, ax2 = plt.subplots(figsize=(7, 5))
if hasattr(pert_data, 'filter_pert_edis') and pert_data.filter_pert_edis:
    edis_values = list(pert_data.filter_pert_edis.values())

    ax2.hist(edis_values, bins=30, alpha=0.7, edgecolor='black', color='green')
    ax2.set_xlabel('E-distance')
    ax2.set_ylabel('Frequency')
    ax2.set_title('Distribution of E-distances')
    ax2.grid(True, alpha=0.3)

    mean_edis = np.mean(edis_values)
    median_edis = np.median(edis_values)
    ax2.axvline(mean_edis, color='red', linestyle='--', label=f'Mean: {mean_edis:.3f}')
    ax2.axvline(median_edis, color='orange', linestyle='--', label=f'Median: {median_edis:.3f}')
    ax2.legend()

    print(f"   - Mean E-distance: {mean_edis:.3f}")
    print(f"   - Median E-distance: {median_edis:.3f}")
    print(f"   - Min E-distance: {np.min(edis_values):.3f}")
    print(f"   - Max E-distance: {np.max(edis_values):.3f}")

else:
    ax2.text(0.5, 0.5, 'E-distance data not available',
             ha='center', va='center', transform=ax2.transAxes)
    ax2.set_title('E-distance Distribution (Not Available)')
    print("   - E-distance data not available")
plt.tight_layout()
plt.show()

# --------- 3. UMAP单独画 ---------
print("\n3. Generating UMAP plot for specific perturbation...")
if pert_cell_counts:
    # Get perturbations sorted by cell count
    pert_cell_df = pd.DataFrame({
        'perturbation': pert_data.filter_perturbation_list,
        'cell_count': pert_cell_counts
    }).sort_values('cell_count', ascending=False)
    # 选一个cell丰度前25%里的扰动
    top_25_percent = int(len(pert_cell_df) * 0.25)
    selected_pert = pert_cell_df.iloc[top_25_percent]['perturbation']
    selected_cell_count = pert_cell_df.iloc[top_25_percent]['cell_count']

    print(f"   - Selected perturbation: {selected_pert}")
    print(f"   - Cell count: {selected_cell_count}")

    try:
        pert_data.plot_umap(
            pert=selected_pert,
            point_size=20,
            color=['perturbation_group'],
            return_adata=False
        )
    except Exception as e:
        print(f"   - UMAP generation failed: {e}")
else:
    print("   - No perturbation data available for UMAP plot.")

# ======= 4. 不画图,只保留详细统计信息 =======
# 汇报数据信息
print("\n" + "=" * 60)
print("DETAILED PERTURBATION ANALYSIS")
print("=" * 60)

# Top 10 perturbations by cell count
print("\nTop 10 perturbations by cell count:")
pert_cell_df = pd.DataFrame({
    'perturbation': pert_data.filter_perturbation_list,
    'cell_count': pert_cell_counts
}).sort_values('cell_count', ascending=False)
top_10_perts = pert_cell_df.head(10)
for idx, row in top_10_perts.iterrows():
    print(f"  {row['perturbation']}: {row['cell_count']} cells")

# Perturbation type distribution
print(f"\nPerturbation type distribution:")
pert_type_counts = pert_data.adata_split.obs['perturbation_new'].value_counts()
for pert_type, count in pert_type_counts.items():
    print(f"  {pert_type}: {count} cells")

# Cell type distribution
print(f"\nCell type distribution:")
celltype_counts = pert_data.adata_split.obs['celltype_new'].value_counts()
for celltype, count in celltype_counts.items():
    print(f"  {celltype}: {count} cells")

print("\nAnalysis completed!")

1. Analyzing cell count distribution...
../_images/tutorials_1_genetic_perturbation_data_11_1.png
   - Mean cells per perturbation: 440.6
   - Median cells per perturbation: 362.0
   - Total perturbations: 224

2. Analyzing E-distance distribution...
   - Mean E-distance: 38.914
   - Median E-distance: 28.915
   - Min E-distance: 0.285
   - Max E-distance: 203.848
../_images/tutorials_1_genetic_perturbation_data_11_3.png

3. Generating UMAP plot for specific perturbation...
   - Selected perturbation: AHR | lymphoblasts
   - Cell count: 558
/nfs/public/lichen/software/miniconda3/envs/scGPT_2/lib/python3.10/site-packages/anndata/_core/anndata.py:1818: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
  utils.warn_names_duplicates("obs")
/nfs/public/lichen/software/miniconda3/envs/scGPT_2/lib/python3.10/site-packages/anndata/_core/anndata.py:1818: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
  utils.warn_names_duplicates("obs")
/nfs/public/lichen/software/miniconda3/envs/scGPT_2/lib/python3.10/site-packages/scanpy/preprocessing/_simple.py:842: UserWarning: Received a view of an AnnData. Making a copy.
  view_to_actual(adata)
/nfs/public/lichen/software/miniconda3/envs/scGPT_2/lib/python3.10/site-packages/anndata/_core/anndata.py:1818: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
  utils.warn_names_duplicates("obs")
/nfs/public/lichen/software/miniconda3/envs/scGPT_2/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:391: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
../_images/tutorials_1_genetic_perturbation_data_11_6.png

============================================================
DETAILED PERTURBATION ANALYSIS
============================================================

Top 10 perturbations by cell count:
  KLF1 | lymphoblasts: 1960 cells
  BAK1 | lymphoblasts: 1457 cells
  CEBPE | lymphoblasts: 1233 cells
  CEBPE_RUNX1T1 | lymphoblasts: 1219 cells
  UBASH3B | lymphoblasts: 1202 cells
  ETS2 | lymphoblasts: 1201 cells
  TBX3_TBX2 | lymphoblasts: 1167 cells
  OSR2 | lymphoblasts: 1003 cells
  SLC4A1 | lymphoblasts: 1000 cells
  SET | lymphoblasts: 986 cells

Perturbation type distribution:
  control: 11855 cells
  KLF1: 1960 cells
  BAK1: 1457 cells
  CEBPE: 1233 cells
  CEBPE_RUNX1T1: 1219 cells
  UBASH3B: 1202 cells
  ETS2: 1201 cells
  TBX3_TBX2: 1167 cells
  OSR2: 1003 cells
  SLC4A1: 1000 cells
  SET: 986 cells
  ELMSAN1: 937 cells
  ETS2_CNN1: 905 cells
  MAP2K6: 878 cells
  FOXF1: 874 cells
  C19orf26: 872 cells
  FOXA1: 851 cells
  UBASH3A: 819 cells
  UBASH3B_OSR2: 796 cells
  DUSP9_ETS2: 787 cells
  RUNX1T1: 779 cells
  MEIS1: 776 cells
  MAPK1: 765 cells
  CNN1: 765 cells
  MAP7D1: 751 cells
  ZBTB25: 740 cells
  DLX2: 732 cells
  DUSP9: 731 cells
  FEV: 704 cells
  HOXB9: 701 cells
  ZBTB1: 693 cells
  TBX3: 691 cells
  TBX2: 690 cells
  PRTG: 685 cells
  IKZF3: 681 cells
  CLDN6: 670 cells
  SET_KLF1: 667 cells
  CBL: 663 cells
  CBFA2T3: 658 cells
  SET_CEBPE: 658 cells
  FOXL2: 652 cells
  ZNF318: 650 cells
  LHX1_ELMSAN1: 647 cells
  TGFBR2: 646 cells
  KIF2C: 634 cells
  FOSB: 630 cells
  MAML2: 626 cells
  LYL1_IER5L: 623 cells
  ZC3HAV1_HOXC13: 618 cells
  SPI1: 613 cells
  IGDCC3: 613 cells
  COL2A1: 598 cells
  BCL2L11: 582 cells
  ISL2: 582 cells
  MAP2K3_ELMSAN1: 581 cells
  MAP2K3: 579 cells
  MAP2K3_MAP2K6: 571 cells
  SLC6A9: 558 cells
  AHR: 558 cells
  UBASH3B_UBASH3A: 554 cells
  CDKN1B: 550 cells
  TMSB4X: 543 cells
  S1PR2: 541 cells
  BCORL1: 533 cells
  ZC3HAV1: 516 cells
  MAPK1_PRTG: 500 cells
  SAMD1: 498 cells
  MAPK1_TGFBR2: 497 cells
  ARRDC3: 495 cells
  FOXA3: 494 cells
  CEBPB: 488 cells
  SGK1: 483 cells
  AHR_KLF1: 481 cells
  CSRNP1: 478 cells
  CELF2: 475 cells
  CNNM4: 470 cells
  CEBPE_KLF1: 468 cells
  SNAI1: 467 cells
  BPGM: 464 cells
  GLB1L2: 463 cells
  BCL2L11_TGFBR2: 463 cells
  CEBPA: 460 cells
  ETS2_MAPK1: 459 cells
  MAP2K6_ELMSAN1: 451 cells
  ETS2_IKZF3: 445 cells
  PTPN12: 445 cells
  ETS2_PRTG: 445 cells
  PTPN9: 445 cells
  IRF1_SET: 444 cells
  ETS2_IGDCC3: 441 cells
  TSC22D1: 441 cells
  LYL1: 432 cells
  IRF1: 427 cells
  FOSB_UBASH3B: 426 cells
  PTPN1: 419 cells
  CEBPE_PTPN12: 415 cells
  CEBPB_MAPK1: 412 cells
  CBL_UBASH3B: 410 cells
  ZC3HAV1_CEBPE: 410 cells
  IER5L: 405 cells
  UBASH3B_CNN1: 402 cells
  CNN1_MAPK1: 398 cells
  KLF1_BAK1: 395 cells
  HOXC13: 385 cells
  FOSB_OSR2: 382 cells
  DUSP9_IGDCC3: 382 cells
  ATL1: 379 cells
  LHX1: 375 cells
  IGDCC3_MAPK1: 368 cells
  DUSP9_KLF1: 366 cells
  MAP4K5: 366 cells
  PTPN12_PTPN9: 364 cells
  MAP4K3: 363 cells
  KLF1_FOXA1: 361 cells
  CDKN1A: 360 cells
  POU3F2: 359 cells
  BPGM_ZBTB1: 358 cells
  SGK1_S1PR2: 358 cells
  ETS2_CEBPE: 354 cells
  SAMD1_PTPN12: 351 cells
  SGK1_TBX2: 349 cells
  CBL_CNN1: 348 cells
  FOSB_PTPN12: 345 cells
  PTPN12_OSR2: 339 cells
  MAP2K3_SLC38A2: 338 cells
  KLF1_TGFBR2: 337 cells
  CBL_PTPN12: 335 cells
  SLC38A2: 332 cells
  CEBPB_PTPN12: 331 cells
  RREB1: 331 cells
  FOXA3_HOXB9: 329 cells
  TMSB4X_BAK1: 329 cells
  COL1A1: 328 cells
  RHOXF2: 328 cells
  TGFBR2_C19orf26: 325 cells
  UBASH3B_PTPN9: 325 cells
  ETS2_MAP7D1: 324 cells
  EGR1: 322 cells
  RHOXF2_SET: 321 cells
  TGFBR2_ETS2: 318 cells
  PTPN12_UBASH3A: 311 cells
  KLF1_CEBPA: 311 cells
  STIL: 309 cells
  POU3F2_FOXL2: 309 cells
  CBL_PTPN9: 306 cells
  PTPN12_ZBTB25: 303 cells
  HK2: 303 cells
  MAPK1_IKZF3: 302 cells
  MAP2K6_SPI1: 302 cells
  BPGM_SAMD1: 301 cells
  TGFBR2_IGDCC3: 301 cells
  MAP2K6_IKZF3: 300 cells
  SGK1_TBX3: 300 cells
  SNAI1_UBASH3B: 299 cells
  KLF1_COL2A1: 298 cells
  UBASH3B_PTPN12: 298 cells
  MIDN: 298 cells
  PRDM1: 297 cells
  DUSP9_PRTG: 297 cells
  FOXA1_HOXB9: 295 cells
  DUSP9_MAPK1: 290 cells
  FEV_ISL2: 284 cells
  PTPN12_SNAI1: 283 cells
  KIAA1804: 278 cells
  FEV_MAP7D1: 277 cells
  AHR_FEV: 276 cells
  HOXA13: 274 cells
  FOXA1_FOXF1: 272 cells
  PTPN13: 270 cells
  JUN: 266 cells
  ZBTB10_PTPN12: 265 cells
  TGFBR2_PRTG: 265 cells
  C3orf72: 264 cells
  KLF1_MAP2K6: 263 cells
  RHOXF2_ZBTB25: 263 cells
  MAP2K3_IKZF3: 261 cells
  UBASH3B_ZBTB25: 255 cells
  KIF18B: 254 cells
  TP73: 248 cells
  FOSB_IKZF3: 247 cells
  ZNF318_FOXL2: 246 cells
  CKS1B: 241 cells
  KMT2A: 241 cells
  SAMD1_UBASH3B: 240 cells
  NCL: 237 cells
  CEBPE_CNN1: 237 cells
  HNF4A: 237 cells
  KLF1_CLDN6: 237 cells
  ARID1A: 232 cells
  CDKN1C: 230 cells
  FOXA3_FOXA1: 216 cells
  POU3F2_CBFA2T3: 216 cells
  CITED1: 215 cells
  FOXO4: 215 cells
  CEBPB_OSR2: 211 cells
  FOXF1_HOXB9: 200 cells
  FOXL2_MEIS1: 193 cells
  NIT1: 192 cells
  LYL1_CEBPB: 192 cells
  FOXA1_FOXL2: 192 cells
  FOXF1_FOXL2: 192 cells
  SAMD1_ZBTB1: 191 cells
  CBL_TGFBR2: 188 cells
  DUSP9_SNAI1: 183 cells
  ZBTB10_ELMSAN1: 182 cells
  CEBPE_CEBPA: 180 cells
  CEBPE_SPI1: 179 cells
  CNN1_UBASH3A: 178 cells
  BCL2L11_BAK1: 175 cells
  FOXA3_FOXF1: 175 cells
  ZBTB10: 162 cells
  FEV_CBFA2T3: 159 cells
  FOSB_CEBPE: 151 cells
  IGDCC3_ZBTB25: 141 cells
  IGDCC3_PRTG: 140 cells
  CDKN1B_CDKN1A: 134 cells
  SNAI1_DLX2: 131 cells
  FOXA3_FOXL2: 130 cells
  HES7: 126 cells
  CDKN1C_CDKN1A: 126 cells
  CEBPE_CEBPB: 114 cells
  PLK4: 113 cells
  CDKN1C_CDKN1B: 107 cells
  PRDM1_CBFA2T3: 106 cells
  ZBTB10_SNAI1: 100 cells

Cell type distribution:
  lymphoblasts: 110554 cells

Analysis completed!
[ ]:

[ ]: