Jupyter Notebook

Integrate scRNA-seq datasets#

!lamin load test-scrna
Hide code cell output
💡 found cached instance metadata: /home/runner/.lamin/instance--testuser1--test-scrna.env
✅ loaded instance: testuser1/test-scrna

import lamindb as ln
import lnschema_bionty as lb
import pandas as pd
import anndata as ad
✅ loaded instance: testuser1/test-scrna (lamindb 0.51.0)
ln.track()
💡 notebook imports: anndata==0.9.2 lamindb==0.51.0 lnschema_bionty==0.30.0 pandas==1.5.3
✅ saved: Transform(id='agayZTonayqAz8', name='Integrate scRNA-seq datasets', short_name='scrna2', version='0', type=notebook, updated_at=2023-08-28 14:45:02, created_by_id='DzTjkKse')
✅ saved: Run(id='ZquoFtvuGCyYpC8uYnXJ', run_at=2023-08-28 14:45:02, transform_id='agayZTonayqAz8', created_by_id='DzTjkKse')

Query files based on metadata#

# Build auto-complete lookups for the assay and species registries.
assays = lb.ExperimentalFactor.lookup()
species = lb.Species.lookup()

# Filter criteria: scRNA-seq assay, human species, and a linked cell type
# whose name contains "monocyte".
file_filters = {
    "experimental_factors": assays.single_cell_rna_sequencing,
    "species": species.human,
    "cell_types__name__contains": "monocyte",
}
# distinct() drops duplicate matches from the result.
query = ln.File.filter(**file_filters).distinct()
query.df()
storage_id key suffix accessor description version initial_version_id size hash hash_type transform_id run_id updated_at created_by_id
id
hfXqZMM05SbQ81R9UfMO 2IsORhwz None .h5ad AnnData Conde22 None None 28049505 WEFcMZxJNmMiUOFrcSTaig md5 Nv48yAceNSh8z8 F4GBwmj0SIFENKkZv60V 2023-08-28 14:44:45 DzTjkKse
xIiOS2YIDPIcL2pDtrkv 2IsORhwz None .h5ad AnnData 10x reference pbmc68k None None 589484 eKVXV5okt5YRYjySMTKGEw md5 Nv48yAceNSh8z8 F4GBwmj0SIFENKkZv60V 2023-08-28 14:44:56 DzTjkKse

Intersect measured genes between two datasets#

# get file objects
file1, file2 = query.list()
file1.describe()
💡 File(id='hfXqZMM05SbQ81R9UfMO', key=None, suffix='.h5ad', accessor='AnnData', description='Conde22', version=None, size=28049505, hash='WEFcMZxJNmMiUOFrcSTaig', hash_type='md5', created_at=2023-08-28 14:44:45, updated_at=2023-08-28 14:44:45)

Provenance:
    πŸ—ƒοΈ storage: Storage(id='2IsORhwz', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-08-28 14:45:01, created_by_id='DzTjkKse')
    πŸ“” transform: Transform(id='Nv48yAceNSh8z8', name='Validate & register scRNA-seq datasets', short_name='scrna', version='0', type='notebook', updated_at=2023-08-28 14:44:56, created_by_id='DzTjkKse')
    πŸ‘£ run: Run(id='F4GBwmj0SIFENKkZv60V', run_at=2023-08-28 14:44:10, transform_id='Nv48yAceNSh8z8', created_by_id='DzTjkKse')
    πŸ‘€ created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-08-28 14:45:01)
Features:
  var (X):
    🔗 index (36503, bionty.Gene.id): ['jLdPEYgxDYdD', 'tLMCNI2bAIMf', 'KvGLKQNvuLuM', 't3UILJ5yRQW0', '0bSbiab6H04M'...]
  obs (metadata):
    🔗 cell_type (32, bionty.CellType): ['CD8-positive, alpha-beta memory T cell', 'macrophage', 'mast cell', 'group 3 innate lymphoid cell', 'effector memory CD4-positive, alpha-beta T cell']
    🔗 assay (4, bionty.ExperimentalFactor): ["10x 5' v1", "10x 3' v3", "10x 5' v2", 'single-cell RNA sequencing']
    🔗 tissue (17, bionty.Tissue): ['blood', 'transverse colon', 'liver', 'duodenum', 'ileum']
    🔗 donor (12, core.Label): ['A37', '582C', '640C', 'A35', '637C']
file1.view_lineage()
https://d33wubrfki0l68.cloudfront.net/b07a3b2fbe98c9922152dd6503aa8bbf9b693f78/1bed9/_images/f45d9783cd4ec03badefffd1bab02e7b36f747a6f35775a27366e25904ed39ff.svg
file2.describe()
💡 File(id='xIiOS2YIDPIcL2pDtrkv', key=None, suffix='.h5ad', accessor='AnnData', description='10x reference pbmc68k', version=None, size=589484, hash='eKVXV5okt5YRYjySMTKGEw', hash_type='md5', created_at=2023-08-28 14:44:56, updated_at=2023-08-28 14:44:56)

Provenance:
    πŸ—ƒοΈ storage: Storage(id='2IsORhwz', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-08-28 14:45:01, created_by_id='DzTjkKse')
    πŸ“” transform: Transform(id='Nv48yAceNSh8z8', name='Validate & register scRNA-seq datasets', short_name='scrna', version='0', type='notebook', updated_at=2023-08-28 14:44:56, created_by_id='DzTjkKse')
    πŸ‘£ run: Run(id='F4GBwmj0SIFENKkZv60V', run_at=2023-08-28 14:44:10, transform_id='Nv48yAceNSh8z8', created_by_id='DzTjkKse')
    πŸ‘€ created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-08-28 14:45:01)
Features:
  var (X):
    🔗 index (695, bionty.Gene.id): ['bmeBMqKh8Ik7', 'zOUVvOZ5PDec', 'brVWiXu4ddF0', 'JHde4Fz7gUF4', '20169ReyBOCR'...]
  external:
    🔗 assay (1, bionty.ExperimentalFactor): ['single-cell RNA sequencing']
    🔗 species (1, bionty.Species): ['human']
  obs (metadata):
    🔗 cell_type (9, bionty.CellType): ['dendritic cell', 'conventional dendritic cell', 'CD14-positive, CD16-negative classical monocyte', 'CD16-positive, CD56-dim natural killer cell, human', 'cytotoxic T cell']
file2.view_lineage()
https://d33wubrfki0l68.cloudfront.net/59b0115df5d688e1205527648e31758c8c5455ac/3c6af/_images/e2918011c56ffd02477d72459a2add22451b2cde61ba7114fc6a9d65767e1fd9.svg

Load files into memory:

file1_adata = file1.load()
file2_adata = file2.load()
💡 adding file hfXqZMM05SbQ81R9UfMO as input for run ZquoFtvuGCyYpC8uYnXJ, adding parent transform Nv48yAceNSh8z8
💡 adding file xIiOS2YIDPIcL2pDtrkv as input for run ZquoFtvuGCyYpC8uYnXJ, adding parent transform Nv48yAceNSh8z8

Here we compute shared genes without loading files:

# Gene feature sets registered under the "var" slot of each file; these come
# from the file metadata, so the AnnData objects are not needed here.
file1_genes = file1.features["var"]
file2_genes = file2.features["var"]

# Genes present in both feature sets.
shared_genes = file1_genes & file2_genes
len(shared_genes)
695
shared_genes.list("symbol")[:10]
['CD164',
 'HNRNPK',
 'CCDC107',
 'EGFL7',
 'EIF3G',
 'CST7',
 'RNF181',
 'OAS1',
 'ABI1',
 'KAT5']

We also need to convert the ensembl_gene_id to symbol for file2 so that they can be concatenated:

# Two-column table of (ensembl_gene_id, symbol) pairs for the shared genes;
# columns are positionally named 0 and 1.
gene_id_symbol_pairs = pd.DataFrame(
    shared_genes.values_list("ensembl_gene_id", "symbol")
)
# Series keyed by Ensembl gene ID (column 0) whose values are symbols (column 1).
genes_by_ensembl_id = gene_id_symbol_pairs.set_index(0)
mapper = genes_by_ensembl_id[1]
mapper.head()
0
ENSG00000135535      CD164
ENSG00000165119     HNRNPK
ENSG00000159884    CCDC107
ENSG00000172889      EGFL7
ENSG00000130811      EIF3G
Name: 1, dtype: object
# Relabel file2's var index in place using `mapper` (Ensembl gene ID -> symbol).
# NOTE(review): the concatenated AnnData printed later in this notebook reports
# n_vars = 0, which suggests the var indices of the two datasets still share no
# labels after this step — confirm which dataset is actually indexed by Ensembl
# gene IDs and therefore needs this rename.
file2_adata.var.rename(index=mapper, inplace=True)

Intersect cell types#

# All cell type records linked to each file.
file1_celltypes = file1.cell_types.all()
file2_celltypes = file2.cell_types.all()

# Cell types annotated in both files, then their names as a plain list.
shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names
['CD16-positive, CD56-dim natural killer cell, human',
 'conventional dendritic cell']

We can now subset the two datasets by shared cell types:

# Boolean masks over obs: True where the cell's annotated type is one of the
# cell types shared by both datasets.
file1_shared_mask = file1_adata.obs["cell_type"].isin(shared_celltypes_names)
file2_shared_mask = file2_adata.obs["cell_type"].isin(shared_celltypes_names)

# Keep only the cells selected by each mask.
file1_adata_subset = file1_adata[file1_shared_mask]
file2_adata_subset = file2_adata[file2_shared_mask]

Concatenate subsetted datasets:

# Subsets to stack, paired with the label recorded for each one in the new
# "file" obs column (the description of the originating file).
subsets_to_merge = [file1_adata_subset, file2_adata_subset]
source_descriptions = [file1.description, file2.description]

adata_concat = ad.concat(subsets_to_merge, label="file", keys=source_descriptions)
adata_concat
AnnData object with n_obs × n_vars = 126 × 0
    obs: 'cell_type', 'file'
    obsm: 'X_umap'
adata_concat.obs.value_counts()
cell_type                                           file                 
CD16-positive, CD56-dim natural killer cell, human  Conde22                  114
conventional dendritic cell                         Conde22                    7
CD16-positive, CD56-dim natural killer cell, human  10x reference pbmc68k      3
conventional dendritic cell                         10x reference pbmc68k      2
dtype: int64
Hide code cell content
# clean up test instance
!lamin delete --force test-scrna
!rm -r ./test-scrna
💡 deleting instance testuser1/test-scrna
✅     deleted instance settings file: /home/runner/.lamin/instance--testuser1--test-scrna.env
✅     instance cache deleted
✅     deleted '.lndb' sqlite file
❗     consider manually deleting your stored data: /home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna