Rationale
This markdown describes how the datasets were prepared
Step 1: Create DF with paths to gistic files
Goal: find the URLs of gistic files for the following cohorts
c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COADREAD", "COAD",
"DLBC", "ESCA", "FPPP", "GBMLGG", "GBM", "HNSC", "KICH", "KIPAN",
"KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO",
"OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD",
"STES", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
You can download all of these from firebrowse, but you must know the tumor ‘type’ you’re looking for (primary solid tissue / primary from blood / metastatic). Most cohorts are primary tumour cohorts (sample type code TP). The exceptions are:
LAML: TB sample code (primary tumor from blood)
SKCM: TM sample code (metastatic)
See https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes for explanation of these types.
Then we build based on the following
https://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/{Cohort}-{Type}/20160128/gdac.broadinstitute.org_{Cohort}-{Type}.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz
# Build one row per TCGA cohort holding its sample-type code and the URL of
# its GISTIC2 Level 4 archive, then write the table to gistic_urls.csv in the
# root of the installed TCGAgisticDB package.

# system.file() returns "" when the package cannot be found. Without this
# check the output path would collapse to "/gistic_urls.csv".
pkg_dir <- system.file(package = "TCGAgisticDB")
if (!nzchar(pkg_dir)) {
  stop(
    "Package 'TCGAgisticDB' is not installed; cannot locate its directory.",
    call. = FALSE
  )
}

# NOTE(review): "FPPP" appears in the cohort list documented above but is not
# included here -- presumably deliberate; confirm.
dplyr::tibble(
  Cohort = c(
    "ACC", "BLCA", "BRCA", "CESC", "CHOL", "COADREAD", "COAD",
    "DLBC", "ESCA", "GBMLGG", "GBM", "HNSC", "KICH", "KIPAN",
    "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO",
    "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD",
    "STES", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM"
  )
) |>
  dplyr::mutate(
    # Sample-type code used in the archive name for each cohort.
    Type = dplyr::case_when(
      Cohort == "LAML" ~ "TB", # primary tumor from blood
      Cohort == "SKCM" ~ "TM", # metastatic
      TRUE ~ "TP" # Primary Tumour
    ),
    # {Cohort} and {Type} are interpolated row-wise from the columns above.
    URL = glue::glue("https://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/{Cohort}-{Type}/20160128/gdac.broadinstitute.org_{Cohort}-{Type}.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz")
  ) |>
  readr::write_csv(file = file.path(pkg_dir, "gistic_urls.csv"))
# http://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/LAML-TB/20160128/gdac.broadinstitute.org_LAML-TB.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz

Step 2: Download GISTIC data
We used the unexported function
gistic_download_all_cohorts(destdir = system.file('gistic_raw/', package = "TCGAgisticDB"))
to download tar-zipped gistic outfiles for each available TCGA cohort
from firebrowse.
We download files from: https://gdac.broadinstitute.org/runs/analyses__latest/data/
Specifically we download
gdac.broadinstitute.org_<cohort>-TP.CopyNumber_Gistic2.Level_4.<analysis_date>.0.0.tar.gz
gistic_download_all_cohorts(destdir = system.file('gistic_raw/', package = "TCGAgisticDB"))

Then we use interchange::gistic_tar_to_crux() on each
download to produce our Rds files. We produce 1 for each setting
(deep/shallow/all).
library(interchange)

# Convert every downloaded GISTIC tarball into one .rds file per copy-number
# level ("all", "shallow", "deep"). Outputs are written to the gistic_rds
# directory of the installed TCGAgisticDB package and are named
# <Cohort>_Firehose.cnlevel_<level>.rds.

# system.file() returns "" when the requested path does not exist; fail early
# instead of silently listing nothing or writing to an unintended location.
path_raw_gistic <- system.file("gistic_raw/", package = "TCGAgisticDB")
if (!nzchar(path_raw_gistic)) {
  stop(
    "Could not find 'gistic_raw' in the installed TCGAgisticDB package.",
    call. = FALSE
  )
}

path_gistic_rds <- system.file("gistic_rds", package = "TCGAgisticDB")
if (!nzchar(path_gistic_rds)) {
  stop(
    "Could not find 'gistic_rds' in the installed TCGAgisticDB package.",
    call. = FALSE
  )
}

# `pattern` is a regular expression: escape the dots and anchor at the end so
# that only real archives match (not e.g. "*.tar.gz.md5" side files).
path_gistic_tarzips <- dir(
  path_raw_gistic,
  pattern = "\\.tar\\.gz$",
  full.names = TRUE
)

# One row per tarball: the cohort abbreviation is the run of letters between
# "gdac.broadinstitute.org_" and the following "-" in the archive name.
df_outfile_info <- dplyr::tibble(
  GisticTarZips = path_gistic_tarzips,
  Cohort = stringr::str_extract(
    string = GisticTarZips,
    pattern = "gdac.broadinstitute.org_[a-zA-Z]+?-"
  ) |>
    sub(x = _, "gdac.broadinstitute.org_", "") |>
    sub(x = _, "-$", ""),
  outfile_prefix = paste0(Cohort, "_", "Firehose"),
  outfile_dir = paste0(path_gistic_rds, "/")
)

for (i in seq_len(nrow(df_outfile_info))) {
  # Per-tarball values do not depend on the copy-number level.
  outfile_dir <- df_outfile_info[["outfile_dir"]][i]
  outfile_prefix <- df_outfile_info[["outfile_prefix"]][i]
  gistic_tar <- df_outfile_info[["GisticTarZips"]][i]

  for (cnlevel in c("all", "shallow", "deep")) {
    # outfile_dir already ends in "/", so no extra separator is needed.
    outfile_path <- paste0(
      outfile_dir, outfile_prefix, ".cnlevel_", cnlevel, ".rds"
    )
    message("Writing: ", basename(outfile_path))

    tryCatch(
      expr = {
        # A `warning =` handler on tryCatch() would unwind on the first
        # warning and abandon the conversion. A calling handler reports the
        # warning, muffles it, and lets the conversion run to completion.
        withCallingHandlers(
          convert_gistic_tar_to_crux(
            gistic_tar = gistic_tar,
            outfile = outfile_path,
            isTCGA = TRUE,
            cnLevel = cnlevel
          ),
          warning = function(warn) {
            message("  warning: ", conditionMessage(warn))
            invokeRestart("muffleWarning")
          }
        )
      },
      # Best effort: a failing tarball is skipped, but the reason is reported.
      error = function(err) {
        warning(
          "skipping ", outfile_path, " since reading failed: ",
          conditionMessage(err),
          call. = FALSE
        )
      }
    )
  }
}
Then we just create a csv that maps cohort to RDS file
# Build cohorts.csv: one row per .rds file in the package's gistic_rds
# directory, annotated with the cohort's full study name and DOI, written to
# the root of the installed TCGAgisticDB package.

# system.file() returns "" when the package cannot be found. Without this
# check the output path would collapse to "/cohorts.csv".
pkg_dir <- system.file(package = "TCGAgisticDB")
if (!nzchar(pkg_dir)) {
  stop(
    "Package 'TCGAgisticDB' is not installed; cannot locate its directory.",
    call. = FALSE
  )
}

# Lookup tables shipped with the package. The columns used below are
# abbreviation/study_name and Cohort/DOI respectively.
df_studynames <- utils::read.csv(
  system.file("tcga_study_abbreviations.csv", package = "TCGAgisticDB"),
  strip.white = TRUE
)
df_doi <- utils::read.csv(
  system.file("doi.csv", package = "TCGAgisticDB"),
  strip.white = TRUE
)

outfile_dir <- paste0(system.file("gistic_rds", package = "TCGAgisticDB"), "/")

# `pattern` is a regular expression; anchor it so only names ending in ".rds"
# are listed. full.names is not set, so these are bare file names.
rds_files <- dir(outfile_dir, pattern = "\\.rds$")

# File names look like <Cohort>_<Source>.cnlevel_<CopyNumberLevel>.rds; split
# them into columns, then look up the study name and DOI by cohort.
# match() yields NA for cohorts missing from a lookup table.
df_data <- dplyr::tibble(Filepath = rds_files) |>
  tidyr::extract(
    col = Filepath,
    into = c("Cohort", "Source", "CopyNumberLevel"),
    regex = "([a-zA-Z]+?)_([a-zA-Z]+?)\\.cnlevel_([a-zA-Z]+?)\\.rds",
    remove = FALSE
  ) |>
  dplyr::mutate(
    FullName = df_studynames$study_name[match(Cohort, df_studynames$abbreviation)],
    DOI = df_doi$DOI[match(Cohort, df_doi$Cohort)]
  )

# Debugging aid: list cohorts that have no "all" copy-number level.
#df_data |> dplyr::group_by(Cohort) |> dplyr::filter(!"all" %in% CopyNumberLevel)

df_data |>
  dplyr::select(Cohort, FullName, Source, CopyNumberLevel, Filepath, DOI) |>
  readr::write_csv(file.path(pkg_dir, "cohorts.csv"))