+ All Categories
Home > Documents > 1. Data Preprocessing - University of California, Los Angeles · PDF file1. Data Preprocessing...

1. Data Preprocessing - University of California, Los Angeles · PDF file1. Data Preprocessing...

Date post: 23-Feb-2018
Category:
Upload: hoangkhanh
View: 216 times
Download: 0 times
Share this document with a friend
20
# 1. Data Preprocessing
#
# Reads the raw (non-normalized) array data with lumi, draws QC plots of the
# raw data, then log2-transforms the data and splits it into all-sample,
# cortex and cerebellum (vermis) datasets.
library(lumi)
library(limma)
library(gplots)
library(WGCNA)

#---------------Read raw data---------------------------------------------------
filename <- "GSE28521_non-normalized_data 16-36-41.txt"
data.lumi <- lumiR(filename)
sampleKey <- read.csv("sampleKeyComplete.csv")

#---------------QCPlots of non-normalized data----------------------------------
# Row of the sample key that corresponds to each array in data.lumi.
matchSN <- match(sampleNames(data.lumi), sampleKey$GEO_SampleName)
if (anyNA(matchSN)) {
  # Unmatched arrays would otherwise silently get NA colours and be left out
  # of both the cortex and the vermis subsets.
  warning(
    sum(is.na(matchSN)),
    " array(s) in the raw data have no entry in sampleKey$GEO_SampleName",
    call. = FALSE
  )
}

# Brain-area code per array: "F" and "T" are plotted as frontal and temporal
# cortex, "C" as cerebellum (see the plot subtitles below).
colArea <- as.character(sampleKey$Brain.area[matchSN])
cortex <- which(colArea %in% c("F", "T"))
vermis <- which(colArea %in% "C")

# One colour per chip, derived from characters 9-10 of the chip identifier.
colChip <- labels2colors(
  as.numeric(substr(sampleKey$Chip[matchSN], start = 9, stop = 10))
)

# Translate area / phenotype codes into plotting colours. %in% never yields
# NA, so arrays without a sample-key entry are simply left untouched.
colArea[colArea %in% "F"] <- "red"
colArea[colArea %in% "T"] <- "black"
colArea[colArea %in% "C"] <- "green"

colPhenotype <- as.character(sampleKey$A.C[matchSN])
colPhenotype[colPhenotype %in% "A"] <- "red"
colPhenotype[colPhenotype %in% "C"] <- "blue"

# Use characters 9-12 of the sample names (described in the original script
# as their last 4 characters) as short sample labels for the plots.
shortLabels <- substr(sampleNames(data.lumi), start = 9, stop = 12)
if (anyDuplicated(shortLabels) > 0) {
  warning("Shortened sample labels are not unique", call. = FALSE)
}
sampleNames(data.lumi) <- shortLabels

pdf("QCplots_beforeNormalization1.pdf", height = 20, width = 15)
plot(data.lumi, what = "density")
plot(data.lumi, what = "cv")
dev.off()

pdf("QCplots_beforeNormalization2.pdf", height = 10, width = 15)
plot(data.lumi, what = "boxplot", col = colChip)
plot(data.lumi, what = "sampleRelation", method = "cluster")
# The original calls ended in ",)" which passes an empty trailing argument;
# it is dropped here.
plot(
  data.lumi, what = "sampleRelation", method = "mds", col = colArea,
  sub = "red-frontal cortex, black-temporal cortex, green-cerebellum"
)
plot(
  data.lumi, what = "sampleRelation", method = "mds", col = colPhenotype,
  sub = "red-autism, blue-control"
)
dev.off()

#---------------log2 transform and Normalize------------------------------------
# log2 transform and create separate datasets for: all samples, cortex,
# cerebellum
dataAll.lumi <- lumiT(data.lumi, method = "log2")
dataCTX.lumi <- dataAll.lumi[, cortex]
dataVER.lumi <- dataAll.lumi[, vermis]
Transcript

# 1. Data Preprocessing
#
# Reads the raw (non-normalized) array data with lumi, draws QC plots of the
# raw data, then log2-transforms the data and splits it into all-sample,
# cortex and cerebellum (vermis) datasets.
library(lumi)
library(limma)
library(gplots)
library(WGCNA)

#---------------Read raw data---------------------------------------------------
filename <- "GSE28521_non-normalized_data 16-36-41.txt"
data.lumi <- lumiR(filename)
sampleKey <- read.csv("sampleKeyComplete.csv")

#---------------QCPlots of non-normalized data----------------------------------
# Row of the sample key that corresponds to each array in data.lumi.
matchSN <- match(sampleNames(data.lumi), sampleKey$GEO_SampleName)
if (anyNA(matchSN)) {
  # Unmatched arrays would otherwise silently get NA colours and be left out
  # of both the cortex and the vermis subsets.
  warning(
    sum(is.na(matchSN)),
    " array(s) in the raw data have no entry in sampleKey$GEO_SampleName",
    call. = FALSE
  )
}

# Brain-area code per array: "F" and "T" are plotted as frontal and temporal
# cortex, "C" as cerebellum (see the plot subtitles below).
colArea <- as.character(sampleKey$Brain.area[matchSN])
cortex <- which(colArea %in% c("F", "T"))
vermis <- which(colArea %in% "C")

# One colour per chip, derived from characters 9-10 of the chip identifier.
colChip <- labels2colors(
  as.numeric(substr(sampleKey$Chip[matchSN], start = 9, stop = 10))
)

# Translate area / phenotype codes into plotting colours. %in% never yields
# NA, so arrays without a sample-key entry are simply left untouched.
colArea[colArea %in% "F"] <- "red"
colArea[colArea %in% "T"] <- "black"
colArea[colArea %in% "C"] <- "green"

colPhenotype <- as.character(sampleKey$A.C[matchSN])
colPhenotype[colPhenotype %in% "A"] <- "red"
colPhenotype[colPhenotype %in% "C"] <- "blue"

# Use characters 9-12 of the sample names (described in the original script
# as their last 4 characters) as short sample labels for the plots.
shortLabels <- substr(sampleNames(data.lumi), start = 9, stop = 12)
if (anyDuplicated(shortLabels) > 0) {
  # Later steps map these labels back to full GEO names, which needs them to
  # be unique.
  warning("Shortened sample labels are not unique", call. = FALSE)
}
sampleNames(data.lumi) <- shortLabels

pdf("QCplots_beforeNormalization1.pdf", height = 20, width = 15)
plot(data.lumi, what = "density")
plot(data.lumi, what = "cv")
dev.off()

pdf("QCplots_beforeNormalization2.pdf", height = 10, width = 15)
plot(data.lumi, what = "boxplot", col = colChip)
plot(data.lumi, what = "sampleRelation", method = "cluster")
# The original calls ended in ",)" which passes an empty trailing argument;
# it is dropped here.
plot(
  data.lumi, what = "sampleRelation", method = "mds", col = colArea,
  sub = "red-frontal cortex, black-temporal cortex, green-cerebellum"
)
plot(
  data.lumi, what = "sampleRelation", method = "mds", col = colPhenotype,
  sub = "red-autism, blue-control"
)
dev.off()

#---------------log2 transform and Normalize------------------------------------
# log2 transform and create separate datasets for: all samples, cortex,
# cerebellum
dataAll.lumi <- lumiT(data.lumi, method = "log2")
dataCTX.lumi <- dataAll.lumi[, cortex]
dataVER.lumi <- dataAll.lumi[, vermis]

#---------------Normalize-------------------------------------------------------
# Quantile-normalize the all-sample, cortex and cerebellum datasets, each on
# its own. Expects dataAll.lumi, dataCTX.lumi, dataVER.lumi, cortex, vermis,
# colChip, colArea, colPhenotype and sampleKey from the preceding steps.
dataAll_N.lumi <- lumiN(dataAll.lumi, method = "quantile")
dataCTX_N.lumi <- lumiN(dataCTX.lumi, method = "quantile")
dataVER_N.lumi <- lumiN(dataVER.lumi, method = "quantile")

# extract expression data
dataNormAll <- exprs(dataAll_N.lumi)
dataNormCTX <- exprs(dataCTX_N.lumi)
dataNormVER <- exprs(dataVER_N.lumi)

#---------------QCPlots after normalization-------------------------------------
# Each PDF holds a boxplot, MDS sample-relation plot(s) and an inter-array
# correlation heatmap. The original plot calls ended in ",)" (an empty
# trailing argument); that stray comma is dropped throughout.
pdf("QCplots_AfterNorm All samples.pdf", height = 10, width = 15)
plot(dataAll_N.lumi, what = "boxplot", col = colChip)
plot(
  dataAll_N.lumi, what = "sampleRelation", method = "mds", col = colArea,
  sub = "red-frontal cortex, black-temporal cortex, green-cerebellum"
)
plot(
  dataAll_N.lumi, what = "sampleRelation", method = "mds", col = colPhenotype,
  sub = "red-autism, blue-control"
)
# "pairwise.complete.obs" is what the original abbreviation use = "p"
# resolves to; it is spelled out to avoid relying on partial matching.
IACa <- cor(dataNormAll, use = "pairwise.complete.obs")
heatmap.2(
  IACa, col = cm.colors(300), key = TRUE, cexRow = 1.2, cexCol = 1.2,
  hline = 0, vline = 0, tracecol = 0,
  ColSideColors = colPhenotype, RowSideColors = colArea,
  dendrogram = "row",
  main = "Inter-array correlation, All samples",
  xlab = " Side bar: red-frontal cortex, black-temporal cortex, green-cerebellum, Top bar: red-autism, blue-control"
)
dev.off()

pdf("QCplots_AfterNorm Cortex samples.pdf", height = 10, width = 15)
plot(dataCTX_N.lumi, what = "boxplot", col = colChip[cortex])
plot(
  dataCTX_N.lumi, what = "sampleRelation", method = "mds",
  col = colArea[cortex],
  sub = "red-frontal cortex, black-temporal cortex"
)
plot(
  dataCTX_N.lumi, what = "sampleRelation", method = "mds",
  col = colPhenotype[cortex],
  sub = "red-autism, blue-control"
)
IACc <- cor(dataNormCTX, use = "pairwise.complete.obs")
heatmap.2(
  IACc, col = cm.colors(300), key = TRUE, cexRow = 1.2, cexCol = 1.2,
  hline = 0, vline = 0, tracecol = 0,
  ColSideColors = colPhenotype[cortex], RowSideColors = colArea[cortex],
  dendrogram = "row",
  main = "Inter-array correlation, Cortex samples ",
  xlab = " Side bar: red-frontal cortex, black-temporal cortex, Top bar: red-autism, blue-control"
)
dev.off()

# Cerebellum samples all share one brain area, so there is no area-coloured
# MDS plot and no row side bar here.
pdf("QCplots_AfterNorm Cerebellum samples.pdf", height = 10, width = 15)
plot(dataVER_N.lumi, what = "boxplot", col = colChip[vermis])
plot(
  dataVER_N.lumi, what = "sampleRelation", method = "mds",
  col = colPhenotype[vermis],
  sub = "red-autism, blue-control"
)
IACv <- cor(dataNormVER, use = "pairwise.complete.obs")
heatmap.2(
  IACv, col = cm.colors(300), key = TRUE, cexRow = 1.2, cexCol = 1.2,
  hline = 0, vline = 0, tracecol = 0,
  ColSideColors = colPhenotype[vermis],
  dendrogram = "row",
  main = "Inter-array correlation, Cerebellum samples",
  xlab = " Top bar: red-autism, blue-control"
)
dev.off()

#---------------Select expressed probes-----------------------------------------
# Keep the probes whose detectionCall() value (computed on the un-normalized
# log2 data with Th = 0.01) exceeds a fixed cut-off per dataset.
# NOTE(review): the cut-offs 39 / 29 / 10 presumably mean "detected in at
# least half of the arrays" for the 79 / 58 / 21 arrays plotted - confirm
# before reusing this script with a different number of samples.
pcountA <- detectionCall(dataAll.lumi, Th = 0.01)
dataNormAllExp <- dataNormAll[pcountA > 39, ]
dim(dataNormAllExp)

pcountC <- detectionCall(dataCTX.lumi, Th = 0.01)
dataNormCTXExp <- dataNormCTX[pcountC > 29, ]
dim(dataNormCTXExp)

pcountV <- detectionCall(dataVER.lumi, Th = 0.01)
dataNormVERExp <- dataNormVER[pcountV > 10, ]
dim(dataNormVERExp)

#---------------Save Normalized Data--------------------------------------------
# The column names are 4-character labels (characters 9-12 of the GEO sample
# names). Map them back to the full GEO sample names before writing the CSVs.
geoShortLabels <- substr(sampleKey$GEO_SampleName, start = 9, stop = 12)
ambiguousLabels <- geoShortLabels[duplicated(geoShortLabels)]

reverseCtx <- match(colnames(dataNormCTXExp), geoShortLabels)
reverseVer <- match(colnames(dataNormVERExp), geoShortLabels)

# match() returns NA for unknown labels and only the first hit for repeated
# ones; either case would write wrong or missing column names without notice.
if (anyNA(reverseCtx) || anyNA(reverseVer)) {
  warning(
    "Some sample labels could not be mapped back to a GEO sample name",
    call. = FALSE
  )
}
exportedLabels <- c(colnames(dataNormCTXExp), colnames(dataNormVERExp))
if (any(exportedLabels %in% ambiguousLabels)) {
  warning(
    "Some sample labels match more than one GEO sample name; ",
    "the first match is used",
    call. = FALSE
  )
}

colnames(dataNormCTXExp) <- sampleKey$GEO_SampleName[reverseCtx]
write.csv(dataNormCTXExp, "dataNormCTXExp.csv")

colnames(dataNormVERExp) <- sampleKey$GEO_SampleName[reverseVer]
write.csv(dataNormVERExp, "dataNormVERExp.csv")

68

1012

1416

Boxplot of microarray intensity

ampl

itude

12_A

12_B

12_C

12_D

12_F

12_G

12_H

03_A

03_B

03_C

03_D

03_E

03_F

11_A

11_B

11_F

11_G

11_H

02_D

13_A

13_C

13_D

13_E

13_G

13_H

08_A

08_B

08_C

08_G

08_H

10_D

10_G

10_H

05_A

05_B

05_C

05_D

05_E

05_F

05_H

31_A

31_B

31_D

31_E

31_F

31_H

07_A

07_B

07_C

07_D

07_E

07_F

07_G

06_A

06_C

06_D

06_E

06_F

06_G

15_A

15_B

15_D

15_E

15_F

15_G

15_H

04_A

04_B

04_C

04_D

04_E

04_F

04_H

14_A

14_B

14_D

14_E

14_F

14_G

12_G

06_C

03_F

06_F

14_F

12_C

15_F

08_C

07_C

13_H

11_F

07_F

05_F

04_F

03_C

31_F 05

_C04

_C12

_F13

_C08

_G12

_H12

_D05

_A05

_B 08_A

07_A

07_B

04_A

04_B

08_H

10_H

03_B

12_A

12_B

02_D

07_E

13_D

13_E

07_D

03_A

05_D

10_D

04_D

04_E

05_H

04_H

11_H

03_D

31_D

15_D

07_G

06_G 15_E

05_E

14_D

06_D

06_E 03

_E31

_E13

_A08

_B11

_G10

_G15

_G14

_A14

_B15

_A15

_B11

_A15

_H06

_A31

_A31

_B14

_G31

_H14

_E11

_B13

_G

1020

3040

5060

70

Sample relations based on 11879 genes with sd/mean > 0.1

hclust (*, "average")Sample

Hei

ght

−20 0 20 40 60

−40

−20

020

40

Sample relations based on 11879 genes with sd/mean > 0.1

red−frontal cortex, black−temporal cortex, green−cerebellumPrincipal Component 1 (42%)

Prin

cipa

l Com

pone

nt 2

(22.

7%)

12_A

12_B

12_C

12_D

12_F

12_G

12_H

03_A

03_B

03_C

03_D03_E

03_F

11_A

11_B

11_F

11_G

11_H

02_D

13_A13_C

13_D13_E

13_G

13_H

08_A

08_B

08_C

08_G

08_H

10_D

10_G

10_H

05_A

05_B

05_C

05_D

05_E

05_F

05_H

31_A

31_B

31_D31_E

31_F

31_H

07_A07_B

07_C

07_D07_E

07_F

07_G

06_A

06_C

06_D

06_E

06_F

06_G

15_A15_B

15_D15_E

15_F

15_G

15_H

04_A04_B

04_C

04_D04_E

04_F

04_H

14_A

14_B

14_D

14_E

14_F

14_G

−20 0 20 40 60

−40

−20

020

40

Sample relations based on 11879 genes with sd/mean > 0.1

red−autism, blue−controlPrincipal Component 1 (42%)

Prin

cipa

l Com

pone

nt 2

(22.

7%)

12_A

12_B

12_C

12_D

12_F

12_G

12_H

03_A

03_B

03_C

03_D03_E

03_F

11_A

11_B

11_F

11_G

11_H

02_D

13_A13_C

13_D13_E

13_G

13_H

08_A

08_B

08_C

08_G

08_H

10_D

10_G

10_H

05_A

05_B

05_C

05_D

05_E

05_F

05_H

31_A

31_B

31_D31_E

31_F

31_H

07_A07_B

07_C

07_D07_E

07_F

07_G

06_A

06_C

06_D

06_E

06_F

06_G

15_A15_B

15_D15_E

15_F

15_G

15_H

04_A04_B

04_C

04_D04_E

04_F

04_H

14_A

14_B

14_D

14_E

14_F

14_G

6 8 10 12 14 16

0.0

0.2

0.4

0.6

0.8

Density plot of intensity

intensity

dens

ity

12_A12_B12_C12_D12_F12_G12_H03_A03_B03_C03_D03_E03_F11_A11_B11_F11_G11_H02_D13_A13_C13_D13_E13_G13_H08_A08_B08_C08_G08_H10_D10_G10_H05_A05_B05_C05_D05_E05_F05_H31_A31_B31_D31_E31_F31_H07_A07_B07_C07_D07_E07_F07_G06_A06_C06_D06_E06_F06_G15_A15_B15_D15_E15_F15_G15_H04_A04_B04_C04_D04_E04_F04_H14_A14_B14_D14_E14_F14_G

−7 −6 −5 −4 −3 −2

0.0

0.2

0.4

0.6

0.8

1.0

1.2

1.4

Density plot of coefficient of variance

coefficient of variance (log2)

Den

sity

12_A12_B12_C12_D12_F12_G12_H03_A03_B03_C03_D03_E03_F11_A11_B11_F11_G11_H02_D13_A13_C13_D13_E13_G13_H08_A08_B08_C08_G08_H10_D10_G10_H05_A05_B05_C05_D05_E05_F05_H31_A31_B31_D31_E31_F31_H07_A07_B07_C07_D07_E07_F07_G06_A06_C06_D06_E06_F06_G15_A15_B15_D15_E15_F15_G15_H04_A04_B04_C04_D04_E04_F04_H14_A14_B14_D14_E14_F14_G

810

1214

Boxplot of microarray intensity

ampl

itude

12_A

12_B

12_C

12_D

12_F

12_G

12_H

03_A

03_B

03_C

03_D

03_E

03_F

11_A

11_B

11_F

11_G

11_H

02_D

13_A

13_C

13_D

13_E

13_G

13_H

08_A

08_B

08_C

08_G

08_H

10_D

10_G

10_H

05_A

05_B

05_C

05_D

05_E

05_F

05_H

31_A

31_B

31_D

31_E

31_F

31_H

07_A

07_B

07_C

07_D

07_E

07_F

07_G

06_A

06_C

06_D

06_E

06_F

06_G

15_A

15_B

15_D

15_E

15_F

15_G

15_H

04_A

04_B

04_C

04_D

04_E

04_F

04_H

14_A

14_B

14_D

14_E

14_F

14_G

−20 −10 0 10 20 30 40

−20

−10

010

2030

40

Sample relations based on 18822 genes with sd/mean > 0.1

red−frontal cortex, black−temporal cortex, green−cerebellumPrincipal Component 1 (42.8%)

Prin

cipa

l Com

pone

nt 2

(18.

8%)

12_A

12_B

12_C

12_D

12_F

12_G

12_H

03_A

03_B

03_C

03_D03_E

03_F

11_A

11_B

11_F

11_G11_H

02_D

13_A

13_C

13_D13_E

13_G

13_H08_A

08_B

08_C

08_G

08_H

10_D

10_G

10_H

05_A

05_B05_C

05_D05_E05_F

05_H

31_A

31_B

31_D31_E

31_F

31_H

07_A07_B

07_C

07_D07_E

07_F

07_G

06_A

06_C06_D

06_E

06_F

06_G

15_A15_B

15_D15_E

15_F

15_G

15_H

04_A04_B

04_C04_D

04_E 04_F

04_H

14_A

14_B

14_D

14_E

14_F

14_G

−20 −10 0 10 20 30 40

−20

−10

010

2030

40

Sample relations based on 18822 genes with sd/mean > 0.1

red−autism, blue−controlPrincipal Component 1 (42.8%)

Prin

cipa

l Com

pone

nt 2

(18.

8%)

12_A

12_B

12_C

12_D

12_F

12_G

12_H

03_A

03_B

03_C

03_D03_E

03_F

11_A

11_B

11_F

11_G11_H

02_D

13_A

13_C

13_D13_E

13_G

13_H08_A

08_B

08_C

08_G

08_H

10_D

10_G

10_H

05_A

05_B05_C

05_D05_E05_F

05_H

31_A

31_B

31_D31_E

31_F

31_H

07_A07_B

07_C

07_D07_E

07_F

07_G

06_A

06_C06_D

06_E

06_F

06_G

15_A15_B

15_D15_E

15_F

15_G

15_H

04_A04_B

04_C04_D

04_E 04_F

04_H

14_A

14_B

14_D

14_E

14_F

14_G

12_F

13_C

08_G

05_C

05_F

04_F

11_F

04_C

03_C

31_F

12_C

03_F

12_G

06_F

06_C

15_F

14_F

07_C

08_C

13_H

07_F

14_G

31_B

13_G

31_A

13_A

06_A

11_B

14_E

31_H

12_H

04_B

04_A

08_A

07_B

07_A

12_B

08_H

10_H

12_A

07_D

03_B

13_E

13_D

15_H

07_G

06_E

15_G

08_B

10_G

11_G

14_B

14_A

11_A

15_A

15_B

12_D

11_H

06_G

03_E

06_D

15_E

14_D

31_D

15_D

03_D

05_A

05_B

04_H

05_H

02_D

07_E

04_D

04_E

05_D

31_E

05_E

03_A

10_D

Side bar: red−frontal cortex, black−temporal cortex, green−cerebellum, Top bar: red−autism, blue−control

12_F13_C08_G05_C05_F04_F11_F04_C03_C31_F12_C03_F12_G06_F06_C15_F14_F07_C08_C13_H07_F14_G31_B13_G31_A13_A06_A11_B14_E31_H12_H04_B04_A08_A07_B07_A12_B08_H10_H12_A07_D03_B13_E13_D15_H07_G06_E15_G08_B10_G11_G14_B14_A11_A15_A15_B12_D11_H06_G03_E06_D15_E14_D31_D15_D03_D05_A05_B04_H05_H02_D07_E04_D04_E05_D31_E05_E03_A10_D

Inter−array correlation, All samples

0.88 0.9 0.92 0.94 0.96 0.98 1Value

020

4060

80

Color Keyand Histogram

Cou

nt

810

1214

Boxplot of microarray intensity

ampl

itude

12_A

12_B

12_D

12_H

03_A

03_B

03_D

03_E

11_A

11_B

11_G

11_H

02_D

13_A

13_D

13_E

13_G

08_A

08_B

08_H

10_D

10_G

10_H

05_A

05_B

05_D

05_E

05_H

31_A

31_B

31_D

31_E

31_H

07_A

07_B

07_D

07_E

07_G

06_A

06_D

06_E

06_G

15_A

15_B

15_D

15_E

15_G

15_H

04_A

04_B

04_D

04_E

04_H

14_A

14_B

14_D

14_E

14_G

−20 −10 0 10 20 30 40

−10

010

20

Sample relations based on 18149 genes with sd/mean > 0.1

red−frontal cortex, black−temporal cortexPrincipal Component 1 (35.5%)

Prin

cipa

l Com

pone

nt 2

(8.9

%)

12_A12_B

12_D

12_H

03_A03_B

03_D

03_E

11_A

11_B

11_G

11_H

02_D

13_A

13_D

13_E

13_G

08_A

08_B

08_H

10_D

10_G

10_H

05_A

05_B

05_D

05_E

05_H

31_A

31_B31_D

31_E

31_H

07_A07_B07_D

07_E

07_G

06_A

06_D

06_E

06_G

15_A15_B

15_D

15_E

15_G

15_H

04_A04_B

04_D 04_E

04_H

14_A

14_B

14_D

14_E

14_G

−20 −10 0 10 20 30 40

−10

010

20

Sample relations based on 18149 genes with sd/mean > 0.1

red−autism, blue−controlPrincipal Component 1 (35.5%)

Prin

cipa

l Com

pone

nt 2

(8.9

%)

12_A12_B

12_D

12_H

03_A03_B

03_D

03_E

11_A

11_B

11_G

11_H

02_D

13_A

13_D

13_E

13_G

08_A

08_B

08_H

10_D

10_G

10_H

05_A

05_B

05_D

05_E

05_H

31_A

31_B31_D

31_E

31_H

07_A07_B07_D

07_E

07_G

06_A

06_D

06_E

06_G

15_A15_B

15_D

15_E

15_G

15_H

04_A04_B

04_D 04_E

04_H

14_A

14_B

14_D

14_E

14_G

14_G

31_B

13_G

13_A

06_A

11_B

31_A

14_E

31_H

12_H

04_B

04_A

12_B

08_H

10_H

03_B

12_A

13_E

13_D

08_A

07_B

07_A

02_D

07_D

07_E

04_D

04_E

03_A

10_D

15_G

08_B

10_G

15_H

14_B

14_A

11_A

15_B

15_A

12_D

11_G

11_H

07_G

06_G

06_E

03_D

03_E

06_D

15_E

14_D

05_E

31_E

05_D

04_H

05_H

31_D

15_D

05_A

05_B

Side bar: red−frontal cortex, black−temporal cortex, Top bar: red−autism, blue−control

14_G31_B13_G13_A06_A11_B31_A14_E31_H12_H04_B04_A12_B08_H10_H03_B12_A13_E13_D08_A07_B07_A02_D07_D07_E04_D04_E03_A10_D15_G08_B10_G15_H14_B14_A11_A15_B15_A12_D11_G11_H07_G06_G06_E03_D03_E06_D15_E14_D05_E31_E05_D04_H05_H31_D15_D05_A05_B

Inter−array correlation, Cortex samples

0.92 0.94 0.96 0.98 1Value

010

2030

4050

60

Color Keyand Histogram

Cou

nt

810

1214

Boxplot of microarray intensity

ampl

itude

12_C

12_F

12_G

03_C

03_F

11_F

13_C

13_H

08_C

08_G

05_C

05_F

31_F

07_C

07_F

06_C

06_F

15_F

04_C

04_F

14_F

−40 −30 −20 −10 0 10

−15

−10

−50

510

1520

Sample relations based on 17337 genes with sd/mean > 0.1

red−autism, blue−controlPrincipal Component 1 (40.7%)

Prin

cipa

l Com

pone

nt 2

(10.

5%)

12_C

12_F

12_G

03_C

03_F

11_F13_C

13_H

08_C

08_G

05_C

05_F 31_F

07_C

07_F06_C

06_F15_F

04_C

04_F

14_F

13_C

12_F

08_G

06_C

12_G

06_F

03_F

08_C

07_C

12_C

04_C

03_C

07_F

31_F

15_F

14_F

05_F

05_C

13_H

04_F

11_F

Top bar: red−autism, blue−control

13_C

12_F

08_G

06_C

12_G

06_F

03_F

08_C

07_C

12_C

04_C

03_C

07_F

31_F

15_F

14_F

05_F

05_C

13_H

04_F

11_F

Inter−array correlation, Cerebellum samples

0.94 0.96 0.98 1Value

05

1015

20

Color Keyand Histogram

Cou

nt


Recommended