library(coreheat)
#> Loading required package: Biobase
#> Loading required package: BiocGenerics
#> Loading required package: generics
#>
#> Attaching package: 'generics'
#> The following objects are masked from 'package:base':
#>
#> as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
#> setequal, union
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
#> as.data.frame, basename, cbind, colnames, dirname, do.call,
#> duplicated, eval, evalq, get, grep, grepl, is.unsorted, lapply,
#> mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
#> rank, rbind, rownames, sapply, saveRDS, table, tapply, unique,
#> unsplit, which.max, which.min
#> Welcome to Bioconductor
#>
#> Vignettes contain introductory material; view with
#> 'browseVignettes()'. To cite Bioconductor, see
#> 'citation("Biobase")', and for packages 'citation("pkgname")'.Create correlation heatmaps from a numeric matrix. Ensembl Gene ID row names can be converted to Gene Symbols using, e.g., BioMart. Optionally, data can be clustered and filtered by correlation, tree cutting and/or number of missing values. Genes of interest can be highlighted in the plot and correlation significance be indicated by asterisks encoding corresponding P-Values. Plot dimensions and label measures are adjusted automatically by default. The plot features rely on the heatmap.n2() function in the ‘heatmapFlex’ package.
Note that these examples will run faster with
database package org.Hs.eg.db. However, since this package
is not available from CRAN and hence cannot be a strong dependency the
code will fall back to using BioMart as annotation resource if
org.Hs.eg.db is not installed. In some examples BioMart is
used intentionally.
Generate a random 10x10 matrix with two distinct sets and plot it with default settings without ID conversion since the IDs are made up.
set.seed(1234)
mat <- matrix(c(rnorm(100, mean = 1), rnorm(100, mean = -1)), nrow = 20)
rownames(mat) <- paste0("gene-", 1:20)
colnames(mat) <- paste0(c("A", "B"), rep(1:5, 2))
cormap2(mat, convert = FALSE, main = "Random matrix")A dataset consisting of 333
patients with primary prostate cancer was downloaded from The Cancer
Genome Atlas (TCGA) via the cBioPortal for Cancer Genomics and two
subsets were created, PrCaTCGASample.txt containing
genes commonly associated with cancer, and
PrCaTCGASample_500.txt containing 500 genes selected at
random.
Function is used to convert Ensembl Gene IDs to HGNC Symbols.
In the following examples, we use the smaller of the two to demonstrate
the basic functionality of the package.
26 genes commonly associated with cancer in general were selected to provide a small demonstration data set
## check for 'org.Hs.eg.db'
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
bm <- TRUE
} else {
bm <- FALSE
}
#>
## Read data and prepare input data frame
fl <- system.file("extdata", "PrCaTCGASample.txt", package = "coreheat", mustWork = TRUE)
dat0 <- read.delim(fl, stringsAsFactors=FALSE)
dat1 <- data.frame(dat0[, grep("TCGA", names(dat0))], row.names=dat0$ensembl_gene_id)
cormap2(dat1, main="TCGA data frame + ID conversion", biomart = bm, use.cache = FALSE)Separately supplied IDs are used with a matrix created from the data frame of the previous example and genes of interest are highlighted inside the heatmap to emphasise individual pair-wise correlations.
dat2 <- as.matrix(dat0[, grep("TCGA", names(dat0))])
sym <- dat0$hgnc_symbol
cormap2(dat1, convert = FALSE, lab = sym, genes2highl = c("GNAS","NCOR1","AR", "ATM"),
main="TCGA matrix + custom labels")The ExpressionSet object is created using the numeric data frame created above from the small TCGA dataset.
## check for 'org.Hs.eg.db'
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
bm <- TRUE
} else {
bm <- FALSE
}
expr <- Biobase::ExpressionSet(as.matrix(dat1))
cormap2(expr, add.sig=TRUE, autoadj=FALSE, cex=1, main="TCGA ExpressionSet object + ID conversion", biomart = bm, use.cache = FALSE)500 genes were selected at random to demonstrate handling of larger
datasets. We will first plot the entire matrix and then use the
filtering capabilities of cormap2 to identify possibly
co-expressed genes. ##### Full map
fl <- system.file("extdata", "PrCaTCGASample_500.txt", package = "coreheat", mustWork = TRUE)
dat0_500 <- read.delim(fl, stringsAsFactors=FALSE)
dat1_500 <- data.frame(dat0_500[, grep("TCGA", names(dat0_500))], row.names=dat0_500$ensembl_gene_id)
cormap2(dat1_500, biomart = TRUE, use.cache = FALSE, main="TCGA 500 genes")Dendrogram branches are cut using function cutreeStatic
from the WGCNA package. This will reduce the number of
branches and, in consequence, the size of the correlation map, while
retaining closely correlated values.
Note that getting the values for cut.thr and
cut-size right can take a bit trial and error.
The number of rows left after filtering is added automatically if
cutting is enabled. To remove that sub-title set postfix to
FALSE.
We also highlight a few genes that all appear to have a high correlation
to see if we find them again in the next plot.
cm <- cormap2(dat1_500, cut.thr = 1.5, cut.size = 40, biomart = TRUE, use.cache = FALSE,
genes2highl = c("RPLP1", "NOP53", "RPL13", "RPS17", "RPL11", "RPS24", "RPL30", "EEF1D", "RPL27",
"SNHG29", "HSPA1A", "HSPA1B", "DNAJB1", "BHLHE40", "NR4A1", "JUND", "HLA-A", "HLA-B",
"BTF3", "EEF1A1P9"),
main="TCGA 500 genes", verbose = TRUE)
#> @ Calculating correlation matrix...
#> Computing correlations...
#> Calculating P-Values...
#> Getting NAs...
#> Getting counts...
#> Getting P-Values...
#> @ Clustering and filtering correlation matrix...
#> Input is a list with correlation matrix and pvalues. Assuming non-clustered matrix.
#> Filtering NAs...
#> Getting distance matrix and hierarchical clustering...
#> Cutting the dendrogram at 1.5...
#> Isolated tree section with 185 genes
#> Recalculating distance matrix and hierarchical clustering...
#> Ordering according to hierarchical clustering...
#> @ Formatting output matrix...
#> Checking input IDs:
#> Attempting to convert input matrix row names...
#> Getting CURL SSL options for securely contacting host 'https://www.ensembl.org'...
#> Using BioMart: 'ENSEMBL_MART_ENSEMBL'
#> Input ID type is 'ensembl_gene_id'
#> Information requested: 'hgnc_symbol'...
#> Merging input IDs and converted IDs...
#> done
#> Replacing 2 missing Gene Symbol(s) by Ensembl IDs...
#> @ Generating heatmap...
#> Ordering Y-axis...
This effectively zooms into the upper right corner of the large heatmap.
In the next step we reduce the branch height to get regions with even
shorter distances.
To start where we left off we use the correlation matrix produced in the
previous step. We set the cut threshold quite low to get the shortest
distances, only, and the number of objects per branch (needed to be
considered a cluster) to a small value, too, to also find smaller
regions.
cm2 <- cormap2(cormat = cm, cut.thr = 1.2, cut.size = 20, biomart = TRUE, use.cache = FALSE,
genes2highl = c("RPLP1", "NOP53", "RPL13", "RPS17", "RPL11", "RPS24", "RPL30", "EEF1D", "RPL27",
"SNHG29", "HSPA1A", "HSPA1B", "DNAJB1", "BHLHE40", "NR4A1", "JUND", "HLA-A", "HLA-B",
"BTF3", "EEF1A1P9"),
main="TCGA 500 genes", cex = 0.9, verbose = TRUE)
#> Input is a complete list with correlation matrix and P-Values.
#> @ Clustering and filtering correlation matrix...
#> Input is a list with correlation matrix and pvalues. Assuming non-clustered matrix.
#> Filtering NAs...
#> Getting distance matrix and hierarchical clustering...
#> Cutting the dendrogram at 1.2...
#> Isolated tree section with 75 genes
#> Recalculating distance matrix and hierarchical clustering...
#> Ordering according to hierarchical clustering...
#> @ Formatting output matrix...
#> Checking input IDs:
#> Attempting to convert input matrix row names...
#> Getting CURL SSL options for securely contacting host 'https://www.ensembl.org'...
#> Using BioMart: 'ENSEMBL_MART_ENSEMBL'
#> Input ID type is 'ensembl_gene_id'
#> Information requested: 'hgnc_symbol'...
#> Merging input IDs and converted IDs...
#> done
#> Replacing 1 missing Gene Symbol(s) by Ensembl IDs...
#> @ Generating heatmap...
#> Ordering Y-axis...
The genes we highlighted in the large map are found here, too. Filtering
by branch height reduced the matrix further “picking” regions with
similarly correlating values, in our case the bottom left block of
positively correlated genes.
But what if we want to filter for only highly correlating values? The package offers two approaches to pick “clusters” of values with a correlation above a certain threshold, a sliding window approach where the map is searched for the threshold and then the window is expanded as long as the threshold is met, and a simple margin approach where the threshold must be met in a given percentage of the rows.
Sliding window The user can set the size of the
window, the correlation threshold and the “number” of the cluster to be
picked. The cluster “number” here is a subjective value chosen after
inspecting the source heatmap by eye and starting to count from the
bottom left. For example, the first cluster with a correlation above 0.6
should give us the darker red block in the bottom left corner of the
first map (object cm). Hence we set the window size to 5
and ask the function to highlight all the genes it finds from the same
list as above. We add significance asterisks to see how good our
correlation is.
cm3 <- cormap2(cormat = cm, cor.thr = 0.6, cor.window = 5, cor.cluster = 1, biomart = TRUE, use.cache = FALSE,
genes2highl = c("RPLP1", "NOP53", "RPL13", "RPS17", "RPL11", "RPS24", "RPL30", "EEF1D", "RPL27",
"SNHG29", "HSPA1A", "HSPA1B", "DNAJB1", "BHLHE40", "NR4A1", "JUND", "HLA-A", "HLA-B",
"BTF3", "EEF1A1P9"),
add.sig = TRUE, cex = 2, main="TCGA 500 genes", verbose = TRUE)
#> Input is a complete list with correlation matrix and P-Values.
#> @ Clustering and filtering correlation matrix...
#> Input is a list with correlation matrix and pvalues. Assuming non-clustered matrix.
#> Filtering NAs...
#> Getting distance matrix and hierarchical clustering...
#> Ordering according to hierarchical clustering...
#> Filtering by correlation...
#> Filtering by correlation threshold in a window of 5
#> Cycle 2 -> Found 1. cluster of correlating values (cor. threshold 0.6)
#> -> Expanding window...
#> -> done.
#> Filtered 170 rows not meeting correlation threshold (0.6 in 50% of the columns)
#> @ Formatting output matrix...
#> Checking input IDs:
#> Attempting to convert input matrix row names...
#> Getting CURL SSL options for securely contacting host 'https://www.ensembl.org'...
#> Using BioMart: 'ENSEMBL_MART_ENSEMBL'
#> Input ID type is 'ensembl_gene_id'
#> Information requested: 'hgnc_symbol'...
#> Merging input IDs and converted IDs...
#> done
#> @ Generating heatmap...
#> Ordering Y-axis...
This approach picked the block we expected.
To find negative or positive correlations, the function offers the possibility to filter by correlation threshold and a margin of rows in which the threshold is met. We start with the first matrix above and set the margin to 0.1 which corresponds to 10% of the rows. Note that this will pick all possible rows, not only adjacent ones. Positive correlation
cm3 <- cormap2(cormat = cm, cor.thr = 0.6, cor.mar = 0.3, biomart = TRUE, use.cache = FALSE,
genes2highl = c("RPLP1", "NOP53", "RPL13", "RPS17", "RPL11", "RPS24", "RPL30", "EEF1D", "RPL27",
"SNHG29", "HSPA1A", "HSPA1B", "DNAJB1", "BHLHE40", "NR4A1", "JUND", "HLA-A",
"HLA-B"),
main="TCGA 500 genes", cex = 1, verbose = T)
#> Input is a complete list with correlation matrix and P-Values.
#> @ Clustering and filtering correlation matrix...
#> Input is a list with correlation matrix and pvalues. Assuming non-clustered matrix.
#> Filtering NAs...
#> Getting distance matrix and hierarchical clustering...
#> Ordering according to hierarchical clustering...
#> Filtering by correlation...
#> Filtering by correlation threshold with a margin of 30%
#> Filtered 120 rows not meeting correlation threshold (0.6 in 30% of the columns)
#> @ Formatting output matrix...
#> Checking input IDs:
#> Attempting to convert input matrix row names...
#> Getting CURL SSL options for securely contacting host 'https://www.ensembl.org'...
#> Using BioMart: 'ENSEMBL_MART_ENSEMBL'
#> Input ID type is 'ensembl_gene_id'
#> Information requested: 'hgnc_symbol'...
#> Merging input IDs and converted IDs...
#> done
#> Replacing 1 missing Gene Symbol(s) by Ensembl IDs...
#> @ Generating heatmap...
#> Ordering Y-axis...
This also extracted a region from the positively correlated region at
the bottom left of the original matrix but only the genes correlating
above the threshold.
Negative correlation Since there are less and less extreme negative correlations in the matrix we have to set the thresholds to very lax values.
## check for 'org.Hs.eg.db'
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
bm <- TRUE
} else {
bm <- FALSE
}
cm3 <- cormap2(cormat = cm, cor.thr = -0.2, cor.mar = 0.2, biomart = bm, use.cache = FALSE,
genes2highl = c("RPLP1", "NOP53", "RPL13", "RPS17", "RPL11", "RPS24", "RPL30", "EEF1D", "RPL27",
"SNHG29", "HSPA1A", "HSPA1B", "DNAJB1", "BHLHE40", "NR4A1", "JUND", "HLA-A",
"HLA-B"),
add.sig = TRUE, cex = 2, main="TCGA 500 genes")The function cormap_filt splits (cuts) a dendrogram of a
hierarchically clustered distance matrix at a given threshold dividing
it into larger or smaller “sub-clusters”. Correlation P-Values are
converted to represent significance as a sub-cluster-wise signal metric
used for filtering. Optionally, up to 3 plots are produced, the third
one being a filtered heatmap based on significance and three height
cutting.
cormap_filt is a convenience function that accepts an
ExpressionSet, a data frame or a numeric matrix as input. Note, that it
will not work on a correlation matrix, as it needs the P-Values
calculated internally for filtering.
## check for 'org.Hs.eg.db'
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
bm <- TRUE
} else {
bm <- FALSE
}
cormap_filt(dat1_500, main = "TCGA 500 genes", cut.thr = 21, p.thr = 0.01, cex.filt = 0.8, biomart = bm, use.cache = FALSE)