Read TCGA data to the tidy Format

readTCGA function allows to read unzipped files:

clinical data - Merge_Clinical.Level_1
rnaseq data (genes' expressions) - rnaseqv2__illuminahiseq_rnaseqv2
genes' mutations data - Mutation_Packager_Calls.Level
Reverse phase protein array data (RPPA) - protein_normalization__data.Level_3
Merge transcriptome agilent data (mRNA) - Merge_transcriptome__agilentg4502a_07_3__unc_edu__Level_3__unc_lowess_normalization_gene_level__data.Level_3
miRNASeq data - Merge_mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3 or "Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3"
methylation data - Merge_methylation__humanmethylation27
isoforms data - Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data.Level_3
CNV data - segmented_scna_minus_germline_cnv_hg19

from TCGA project. Those files can be easily downloaded with downloadTCGA function. See examples.

readTCGA(path, dataType, ...)

Arguments

path: See details and examples.
dataType: One of 'clinical', 'rnaseq', 'mutations', 'RPPA', 'mRNA', 'miRNASeq', 'methylation', 'isoforms', 'CNV' depending on which type of data user is trying to read in the tidy format.
...: Further arguments passed to the as.data.frame.

Value

The output is a data.frame containing the data of the requested dataType.

Details

All cohort names can be checked using: sub( x = names( infoTCGA() ), '-counts', '').

Parameter path specification:

If dataType = 'clinical' a path to a cancerType.clin.merged.txt file.
If dataType = 'mutations' a path to the unzipped folder Mutation_Packager_Calls.Level containing .maf files.
If dataType = 'rnaseq' a path to the unzipped file rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level.
If dataType = 'RPPA' a path to the unzipped file in folder protein_normalization__data.Level_3.
If dataType = 'mRNA' a path to the unzipped file cancerType.transcriptome__agilentg4502a_07_3__unc_edu__Level_3__unc_lowess_normalization_gene_level__data.data.txt.
If dataType = 'miRNASeq' a path to unzipped files cancerType.mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.data.txt or cancerType.mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.data.txt
If dataType = 'methylation' a path to unzipped files cancerType.methylation__humanmethylation27__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.data.txt.
If dataType = 'isoforms' a path to unzipped files cancerType.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data.data.txt.
If dataType = 'CNV' a path to unzipped files cancerType.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg18__seg.Level_3.txt.

Issues

If you have any problems, issues or think that something is missing or is not clear please post an issue on https://github.com/RTCGA/RTCGA/issues.

Examples


## Not run: ------------------------------------
#  
# 
# ##############
# ##### clinical
# ##############
# 
# dir.create('data')
# 
# # downloading clinical data
# # dataset = "clinical" is default parameter so we may omit it
# downloadTCGA(cancerTypes = c('BRCA', 'OV'),
#              destDir = 'data' )
# # shorten paths so that they are shorter than 256 signs - windows issue
#  list.files("data", full.names = TRUE) %>%
#    file.rename(to = substr(., start = 1, stop = 50))
#     
# # reading datasets    
# sapply(c('BRCA', 'OV'), function(element){
#  path <- list.files('data', recursive = TRUE,
#                     full.names = TRUE, 
#                     pattern = "clin.merged.txt")
#  assign(value = readTCGA( path, 'clinical' ), 
#         x = paste0(element, '.clin.data'),
#         envir = .GlobalEnv)})
#      
# ############
# ##### rnaseq
# ############
# 
# dir.create('data2')
# 
# # downloading rnaseq data
# downloadTCGA(cancerTypes = 'BRCA', 
#              dataSet = 'Level_3__RSEM_genes_normalized',
#              destDir = 'data2')
# 
# # shorten paths so that they are shorter than 256 signs - windows issue
# list.files("data2", full.names = TRUE) %>%
#    file.rename(to = substr(., start = 1, stop = 50))
# 
# path_rnaseq <- list.files('data2', recursive = TRUE,
#                           full.names = TRUE, 
#                           pattern = 'illuminahiseq')
# readTCGA(path = path_rnaseq, dataType = 'rnaseq') -> rnaseq_data
# 
# 
# ###############
# ##### mutations
# ###############
# 
# # Example directory in which untarred data will be stored
# dir.create('data3')
# 
# 
# downloadTCGA(cancerTypes = 'OV', 
#              dataSet = 'Mutation_Packager_Calls.Level',
#              destDir = 'data3')
# 
# # reading data
# list.files('data3', recursive = TRUE) -> directory
# 
# readTCGA(directory, 'mutations') -> mut_file
# 
# #################
# ##### methylation
# #################
# 
# # Example directory in which untarred data will be stored
# dir.create('data4')
# 
# # Download KIRP methylation data and store it in data4 folder
# cancerType = "KIRP"
# downloadTCGA(cancerTypes = cancerType,
#              dataSet = "Merge_methylation__humanmethylation27",
#              destDir = "data4")
# 
# # Shorten path of subdirectory with KIRP methylation data
# list.files(path = "data4", full.names = TRUE) %>%
#     file.rename(to = file.path("data4", paste0(cancerType, ".methylation")))
# 
# # Remove manifest.txt file
# list.files(path = "data4", full.names = TRUE, 
#            recursive = TRUE, pattern = "MANIFEST") %>%
#            file.remove()
# 
# # Read KIRP methylation data
# path <- list.files(path = "data4", full.names = TRUE, recursive = TRUE)
# KIRP.methylation <- readTCGA(path, dataType = "methylation")
# 
# 
# ##########
# ##### RPPA
# ##########
# 
# # Directory in which untarred data will be stored
# dir.create('data5')
# 
# # Download BRCA RPPA data and store it in data5 folder
# cancerType = "BRCA"
# downloadTCGA(cancerTypes = cancerType,
#              dataSet = "protein_normalization__data.Level_3",
#              destDir = "data5")
# 
# # Shorten path of subdirectory with BRCA RPPA data
# list.files(path = "data5", full.names = TRUE) %>%
#     file.rename(from = ., to = file.path("data5", paste0(cancerType, ".RPPA")))
# 
# # Remove manifest.txt file
# list.files(path = "data5", full.names = TRUE,
#            recursive = TRUE, pattern = "MANIFEST") %>%
#            file.remove()
# 
# # Read BRCA RPPA data
# path <- list.files(path = "data5", full.names = TRUE, recursive = TRUE) 
# BRCA.RPPA <- readTCGA(path, dataType = "RPPA")
# 
# 
# ##########
# ##### mRNA
# ##########
# 
# # Directory in which untarred data will be stored
# dir.create('data6')
# 
# # Download UCEC mRNA data and store it in data6 folder
# cancerType = "UCEC"
# downloadTCGA(cancerTypes = cancerType,
#              dataSet = "agilentg4502a_07_3__unc_edu__Level_3",
#              destDir = "data6")
# 
# # Shorten path of subdirectory with UCEC mRNA data
# list.files(path = "data6", full.names = TRUE) %>%
#     file.rename(from = ., to = file.path("data6",paste0(cancerType, ".mRNA")))
# 
# # Remove manifest.txt file
# list.files(path = "data6", full.names = TRUE,
#            recursive = TRUE, pattern = "MANIFEST") %>%
#            file.remove()
# 
# # Read UCEC mRNA data
# path <- list.files(path = "data6", full.names = TRUE, recursive = TRUE) 
# UCEC.mRNA <- readTCGA(path, dataType = "mRNA")
# 
# ##############
# ##### miRNASeq
# ##############
# 
# # Directory in which untarred data will be stored
# dir.create('data7')
# 
# # Download BRCA miRNASeq data and store it in data7 folder
# # Remember that miRNASeq data are produced by two machines:
# # Illumina Genome Analyzer and Illumina HiSeq 2000 machines
# cancerType <- "BRCA"
# downloadTCGA(cancerTypes = cancerType,
# dataSet = paste0("Merge_mirnaseq__illuminaga_mirnaseq__bcgsc",
#                 "_ca__Level_3__miR_gene_expression__data.Level_3"),
#              destDir = "data7")
# 
# downloadTCGA(cancerTypes = cancerType,
# dataSet = paste0("Merge_mirnaseq__illuminahiseq_mirnaseq__",
#                  "bcgsc_ca__Level_3__miR_gene_expression__data.Level_3"),
#              destDir = "data7")
# 
# # Shorten path of subdirectory with BRCA miRNASeq data
# list.files(path = "data7", full.names = TRUE) %>%
#     sapply(function(path){
#         if (grepl(pattern = "illuminaga", path)){
#             file.rename(from = grep(pattern = "illuminaga", path, value = TRUE),
#                         to = file.path("data7",paste0(cancerType, ".miRNASeq.illuminaga")))
#         } else if (grepl(pattern = "illuminahiseq", path)){
#             file.rename(from = grep(pattern = "illuminahiseq", path, value = TRUE),
#                         to = file.path("data7",paste0(cancerType, ".miRNASeq.illuminahiseq")))
#         }
#     })
#     
# # Remove manifest.txt file
# list.files(path = "data7", full.names = TRUE,
#            recursive = TRUE, pattern = "MANIFEST") %>%
#            file.remove()
# 
# # Read BRCA miRNASeq data
# path <- list.files(path = "data7", full.names = TRUE, recursive = TRUE)
# path_illuminaga <- grep("illuminaga", path, fixed = TRUE, value = TRUE)
# path_illuminahiseq <- grep("illuminahiseq", path, fixed = TRUE, value = TRUE)
# 
# BRCA.miRNASeq.illuminaga <- readTCGA(path_illuminaga, dataType = "miRNASeq")
# BRCA.miRNASeq.illuminahiseq <- readTCGA(path_illuminahiseq, dataType = "miRNASeq")
# 
# BRCA.miRNASeq.illuminaga <- cbind(machine = "Illumina Genome Analyzer",
#                                   BRCA.miRNASeq.illuminaga)
# BRCA.miRNASeq.illuminahiseq <- cbind(machine = "Illumina HiSeq 2000",
#                                      BRCA.miRNASeq.illuminahiseq)
# 
# BRCA.miRNASeq <- rbind(BRCA.miRNASeq.illuminaga, BRCA.miRNASeq.illuminahiseq)
# 
# ##############
# ##### isoforms
# ##############
# 
# # Directory in which untarred data will be stored
# dir.create('data8')
# 
# # Download ACC isoforms data and store it in data8 folder
# cancerType = "ACC"
# downloadTCGA(cancerTypes = cancerType,
# dataSet = paste0("Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc",
#                  "_edu__Level_3__RSEM_isoforms_normalized__data.Level_3"),
#              destDir = "data8")
# 
# # Shorten path of subdirectory with ACC isoforms data
# list.files(path = "data8", full.names = TRUE) %>%
#     file.rename(from = ., to = file.path("data8",paste0(cancerType, ".isoforms")))
# 
# # Remove manifest.txt file
# list.files(path = "data8", full.names = TRUE,
#            recursive = TRUE, pattern = "MANIFEST") %>%
#            file.remove()
# 
# # Read ACC isoforms data
# path <- list.files(path = "data8", full.names = TRUE, recursive = TRUE) 
# ACC.isoforms <- readTCGA(path, dataType = "isoforms")
# 
## ---------------------------------------------

Read TCGA data to the tidy Format

Arguments

Value

Details

Issues

See also

Examples

Contents

Author