DNA甲基化作為表觀遺傳調(diào)控的重要機(jī)制,在腫瘤發(fā)生、發(fā)展及預(yù)后中發(fā)揮關(guān)鍵作用。TCGA(The Cancer Genome Atlas)數(shù)據(jù)庫作為全球最大的腫瘤數(shù)據(jù)庫,包含了全基因組甲基化芯片數(shù)據(jù),主要涵蓋Illumina 450K(HM450)、850K(EPIC)及最新935K(EPICv2)甲基化芯片數(shù)據(jù),不同平臺在覆蓋范圍、檢測精度及臨床應(yīng)用上各有優(yōu)勢,為癌癥表觀遺傳學(xué)研究提供了寶貴資源;赥CGA甲基化數(shù)據(jù),系統(tǒng)分析不同芯片平臺(450K/850K/935K)的分子表型特征,能夠獲取課題研究思路、篩選目的基因或者驗證關(guān)鍵表觀遺傳標(biāo)志物。在這里,易基因分享開源代碼,旨在為相關(guān)課題的甲基化數(shù)據(jù)分析思路提供一定的參考意見。
1.加載R包 ----------------------------------------------------------------
#DNA甲基化的探針數(shù)據(jù)-必要的R包下載
#BiocManager::install(c("minfi","ChAMPdata","Illumina450ProbeVariants.db","sva","IlluminaHumanMethylation450kmanifest","limma","RPMM","DNAcopy","preprocessCore","impute","marray","wateRmelon","goseq","plyr","GenomicRanges","RefFreeEWAS","qvalue","isva","doParallel","bumphunter","quadprog","shiny","shinythemes","plotly","RColorBrewer","DMRcate","dendextend","IlluminaHumanMethylationEPICmanifest","FEM","matrixStats","missMethyl","combinat"))
rm(list = ls())
options(stringsAsFactors = F)
library(ChAMP)
library(dplyr)
library(tibble)
library(openxlsx)
library(stringr)
library(data.table)
library(impute)
set.seed(123)#設(shè)置隨機(jī)數(shù)種子,保證結(jié)果可重復(fù),自己喜歡的數(shù)字即可
getwd()
2.讀取必要數(shù)據(jù) ----------------------------------------------------------------
clinical_data =read.delim('./00.data/
clinical.tsv')#臨床表型
sample =read.xlsx('./00.data/副本sample.xlsx')#樣本信息
OSCC_21paired =read.csv('./00.data/OSCC_21paired.csv')#探針文件
head(OSCC_21paired)#行為cg島的編號,列為樣本
Promoter_methylation_candidate =read.xlsx('./00.data/副本Promoter_methylation_candidate(1)(1).xlsx')#篩選的基因集(課題感興趣的)
#實驗驗證的有A-B-C,所以需要明確這三個基因的甲基化情況
head(Promoter_methylation_candidate)
human_me =read.csv('./00.data/humanmethylation450_15017482_v1-2 - 副本.csv')#
head(human_me,10)
3.數(shù)據(jù)預(yù)處理 ---------------------------------------------------------------
# 查看數(shù)據(jù)集的大小和列名
dim(sample) #1240 39
colnames(sample)
# 只保留特定列,即 "case_id", "case_submitter_id", "sample_submitter_id", "sample_type"
sample_1 <- sample[, c("case_id", "case_submitter_id", "sample_submitter_id", "sample_type")]
# 統(tǒng)計樣本類型
table(sample_1$sample_type)
# 只保留 "Primary Tumor" 和 "Solid Tissue Normal" 兩種類型
tissue <- c("Primary Tumor", "Solid Tissue Normal")
sample_tissue <- sample_1[sample_1$sample_type %in% tissue, ]
# 統(tǒng)計保留后的樣本類型
table(sample_tissue$sample_type)
# 對樣品進(jìn)行過濾,因為有些腫瘤樣品的取樣多于1次,需要去除
table(sample_tissue$sample_submitter_id)
deleteuniquelines <- function(x) {
stand.col <- x$case_submitter_id
count <- table(stand.col)
if (all(count < 2)) stop("no repeated records")
else {
ind <- sapply(stand.col, function(t) ifelse(count[as.character(t)] > 1, TRUE, FALSE))
return(x[ind,])
}
}
sample_tissue_filter =deleteuniquelines(sample_tissue)
dim(sample_tissue_filter) #815
nt = sample_tissue_filter[sample_tissue_filter$sample_type =="Solid Tissue Normal",]
tt = sample_tissue_filter[sample_tissue_filter$sample_type =="Primary Tumor",]
#對于正常組織,由于都只取了一次樣品,所以不進(jìn)行過濾
#對于腫瘤樣品:只取tt里sample_submitter id編碼最后一位是"A"的樣品,因為B是福爾馬林固定石蠟包埋組織
tt <- tt[substr(tt$sample_submitter_id,16,16) =="A",]
tt <- tt[tt$case_submitter_id %in% nt$case_submitter_id,]
dim(tt) #37
dim(nt) #37
paired_tissue <- rbind(nt,tt)
dim(paired_tissue) #74個配對的 37對
# 讀取"clinical.tsv"文件,提取腫瘤位置信息
tumor_site <- clinical_data
# 取膀胱癌
colnames(tumor_site)
tumor_site$tissue_or_organ_of_origin
# 只保留特定列,即 "case_id", "case_submitter_id", "sample_submitter_id", "sample_type"
tumor_site <- tumor_site[, c("case_id", "case_submitter_id", "tissue_or_organ_of_origin")]
# 統(tǒng)計樣本類型
table(tumor_site$tissue_or_organ_of_origin)
tumor_site[,c('ans','takeout')]=str_split_fixed(tumor_site$tissue_or_organ_of_origin,',',2)
# tumor_site <- tumor_site[tumor_site$ %in% oscc,]
# 取樣位置"取樣位置"和"腫瘤發(fā)生位置",取"腫瘤發(fā)生的位置"
tumor_site <- tumor_site[,c(1,2,4)]
table(tumor_site$ans)#
dim(tumor_site)#824樣品都是在膀胱的
tumor_site_unique = as.data.frame(unique(tumor_site$case_submitter_id))
colnames(tumor_site_unique)= 'case_submitter_id'
merge_info = paired_tissue[paired_tissue$case_submitter_id %in% tumor_site_unique$case_submitter_id,]
dim(merge_info) #74 4 37對
head(me_data)
merge_info$sample_submitter_id =substr(merge_info$sample_submitter_id,1,15)
#輸出過濾文件
write.table(merge_info,'./00.data/paired_sample_clinial_filter.txt',quote = F,row.names = F,col.names = F)
nt_paired = merge_info[merge_info$sample_type =="Solid Tissue Normal",]
tt_paired = merge_info[merge_info$sample_type =="Primary Tumor",]
#讀取上面的數(shù)據(jù)
me_data =OSCC_21paired
rownames(me_data)=me_data[,1]
me_data=me_data[,-1]
# colnames(me_data) <- str_replace(colnames(me_data),".", "-")
nt_paired$sample_submitter_id <- str_replace_all(nt_paired$sample_submitter_id ,"-", ".")
tt_paired$sample_submitter_id <- str_replace_all(nt_paired$sample_submitter_id ,"-", ".")
me_data_nt = me_data[,colnames(me_data) %in% nt_paired$sample_submitter_id]
me_data_tt = me_data[,colnames(me_data) %in% tt_paired$sample_submitter_id]
me_data_paired = cbind(me_data_nt,me_data_tt)
#重新保留
merge_info$sample_submitter_id <- str_replace_all(merge_info$sample_submitter_id ,"-", ".")
write.table(merge_info,'./00.data//paired_sample_clinial_filter.txt',quote = F,row.names = F,col.names = F)
write.csv(me_data_paired,'./00.data//pangguangai_21paired_me_data_412481.csv')
write.csv(me_data,'./00.data//pangguangai_me_data_412481.csv')
write.csv(merge_info,'./00.data//merge_info.csv')
本篇我們重點探討了甲基化數(shù)據(jù)分析的預(yù)處理。在下一期中,我們將深入分析:
- CHAMP對象的生成以及可視化
- 差異基因和功能富集分析