Preparing GISTIC data • TCGAgisticDB

Rationale

This markdown describes how the datasets were prepared

Step 1: Create DF with paths to gistic files

Goal: find the URLs of gistic files for the following cohorts

c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COADREAD", "COAD", 
"DLBC", "ESCA", "FPPP", "GBMLGG", "GBM", "HNSC", "KICH", "KIPAN", 
"KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO", 
"OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", 
"STES", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")

You can download all of these from firebrowse, but you must know the tumor ‘type’ you’re looking for (primary solid tissue / primary from blood / metastatic). Most cohorts are primary tumour cohorts; the exceptions are:

LAML  TB (primary tumor from blood)
SKCM  TM (metastatic)

See https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes for explanation of these types.

Then we build each download URL from the following template, substituting the cohort and the sample type code:

https://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/{Cohort}-{Type}/20160128/gdac.broadinstitute.org_{Cohort}-{Type}.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz

# Build the cohort -> GISTIC2 archive URL table and save it as gistic_urls.csv
# inside the installed TCGAgisticDB package directory.
firehose_cohorts <- c(
  "ACC", "BLCA", "BRCA", "CESC", "CHOL", "COADREAD", "COAD",
  "DLBC", "ESCA", "GBMLGG", "GBM", "HNSC", "KICH", "KIPAN",
  "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC", "MESO",
  "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD",
  "STES", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM"
)

# Cohorts whose sample type code is not the default "TP" (Primary Tumour).
sample_type_overrides <- c(
  LAML = "TB", # primary tumor from blood
  SKCM = "TM"  # metastatic
)

# {Cohort} and {Type} are filled in per row by glue.
gistic_url_template <- "https://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/{Cohort}-{Type}/20160128/gdac.broadinstitute.org_{Cohort}-{Type}.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz"

df_gistic_urls <- dplyr::tibble(Cohort = firehose_cohorts) |>
  dplyr::mutate(
    # Cohorts without an override come back as NA and fall through to "TP".
    Type = dplyr::coalesce(unname(sample_type_overrides[Cohort]), "TP"),
    URL = glue::glue(gistic_url_template)
  )

readr::write_csv(
  df_gistic_urls,
  file = file.path(system.file(package = "TCGAgisticDB"), "gistic_urls.csv")
)

  # http://gdac.broadinstitute.org/runs/analyses__2016_01_28/data/LAML-TB/20160128/gdac.broadinstitute.org_LAML-TB.CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz

Step 2: Download GISTIC data

We used the unexported function gistic_download_all_cohorts(destdir = system.file('gistic_raw/', package = "TCGAgisticDB")) to download tar-zipped gistic outfiles for each available TCGA cohort from firebrowse.

We download files from: https://gdac.broadinstitute.org/runs/analyses__latest/data/

Specifically we download gdac.broadinstitute.org_<cohort>-TP.CopyNumber_Gistic2.Level_4.<analysis_date>.0.0.tar.gz

# Download the tar-zipped GISTIC outputs for every cohort into the package's
# gistic_raw/ directory.
raw_gistic_destdir <- system.file('gistic_raw/', package = "TCGAgisticDB")
gistic_download_all_cohorts(destdir = raw_gistic_destdir)

Then we use interchange::convert_gistic_tar_to_crux() on each download to produce our Rds files. We produce one file per copy-number level setting (deep/shallow/all).

library(interchange)

# Directory inside the installed package that holds the downloaded archives.
path_raw_gistic <- system.file('gistic_raw/', package = "TCGAgisticDB")

# `pattern` is a regular expression: escape the dots and anchor at the end of
# the name so only real *.tar.gz archives are listed (a bare "tar.gz" would
# also match names such as *.tar.gz.md5).
path_gistic_tarzips <- dir(path_raw_gistic, pattern = "\\.tar\\.gz$", full.names = TRUE)

# One row per archive: where it lives, which cohort it belongs to, and the
# prefix / directory used to name the .rds files derived from it.
df_outfile_info <- dplyr::tibble(
  GisticTarZips = path_gistic_tarzips,
  # Cohort is the run of letters between "gdac.broadinstitute.org_" and the
  # next "-" in the file name (NA when the name does not follow that scheme).
  Cohort = stringr::str_match(
    basename(GisticTarZips),
    "gdac\\.broadinstitute\\.org_([a-zA-Z]+)-"
  )[, 2],
  outfile_prefix = paste0(Cohort, "_", "Firehose"),
  outfile_dir = paste0(system.file("gistic_rds", package = "TCGAgisticDB"), "/")
)

# Convert every downloaded archive into one .rds file per copy-number level.
for (i in seq_len(nrow(df_outfile_info))) {
  # These values depend only on the archive, so look them up once per archive
  # rather than once per copy-number level.
  gistic_tar <- df_outfile_info[["GisticTarZips"]][i]
  outfile_prefix <- df_outfile_info[["outfile_prefix"]][i]
  # Drop trailing slashes so the joined path does not contain "//".
  outfile_dir <- sub("/+$", "", df_outfile_info[["outfile_dir"]][i])

  for (cnlevel in c("all", "shallow", "deep")) {
    outfile_path <- file.path(
      outfile_dir,
      paste0(outfile_prefix, ".cnlevel_", cnlevel, ".rds")
    )
    message("Writing: ", basename(outfile_path))

    tryCatch(
      # Warnings are muffled with a *calling* handler so the conversion keeps
      # running. A `warning =` handler on tryCatch() would instead unwind at
      # the first warning and silently abandon the conversion.
      expr = withCallingHandlers(
        convert_gistic_tar_to_crux(
          gistic_tar = gistic_tar,
          outfile = outfile_path,
          isTCGA = TRUE,
          cnLevel = cnlevel
        ),
        warning = function(warn) invokeRestart("muffleWarning")
      ),
      # A failed conversion is reported (with its cause) and skipped so the
      # remaining archives are still processed.
      error = function(err) {
        warning(
          "skipping ", outfile_path, " since reading failed: ",
          conditionMessage(err),
          call. = FALSE
        )
      }
    )
  }
}

Then we just create a csv that maps cohort to RDS file

# Lookup tables read from the installed package: study abbreviation -> full
# study name, and cohort -> DOI.
df_studynames <- utils::read.csv(
  system.file("tcga_study_abbreviations.csv", package = "TCGAgisticDB"),
  strip.white = TRUE
)

df_doi <- utils::read.csv(
  system.file("doi.csv", package = "TCGAgisticDB"),
  strip.white = TRUE
)

outfile_dir <- paste0(system.file("gistic_rds", package = "TCGAgisticDB"), "/")

# `pattern` is a regular expression: anchor it so only files ending in ".rds"
# are listed (a bare "rds" matches any name that merely contains those letters).
rds_files <- dir(outfile_dir, pattern = "\\.rds$")

# File names look like <Cohort>_<Source>.cnlevel_<CopyNumberLevel>.rds; split
# them into columns while keeping the bare file name in Filepath, then attach
# the full study name and DOI of each cohort (NA when the cohort has no entry
# in the corresponding lookup table).
df_data <- dplyr::tibble(Filepath = rds_files) |>
  tidyr::extract(
    col = Filepath,
    into = c("Cohort", "Source", "CopyNumberLevel"),
    regex = "([a-zA-Z]+?)_([a-zA-Z]+?)\\.cnlevel_([a-zA-Z]+?)\\.rds",
    remove = FALSE
  ) |>
  dplyr::mutate(
    FullName = df_studynames$study_name[match(Cohort, df_studynames$abbreviation)],
    DOI = df_doi$DOI[match(Cohort, df_doi$Cohort)]
  )

# Sanity check: list cohorts that are missing the "all" copy-number level.
#df_data |> dplyr::group_by(Cohort) |> dplyr::filter(!"all" %in% CopyNumberLevel)

# Write the cohort -> RDS file mapping into the installed package directory.
df_data |>
  dplyr::select(Cohort, FullName, Source, CopyNumberLevel, Filepath, DOI) |>
  readr::write_csv(paste0(system.file(package = "TCGAgisticDB"), "/cohorts.csv"))