Rearranging and manipulating data - Evolutionary...

Post on 03-Oct-2020

0 views 0 download

transcript

An introduction to

Noémie Becker & Benedikt Holtmann

Winter Semester 16/17

Rearranging and manipulating data

Course outline – Day 7

Course outline

•  Review – Checking and cleaning data

•  Rearranging and manipulating data

•  Reshaping data

•  Combining datasets

•  Making new variables

•  Subsetting data

•  Summarising data

Review – Cleaning and checking data

Set working directory using setwd()
setwd("~/Desktop")
Import data using read.table() and read.csv() functions
myData <- read.csv(file = "datafile.csv", header = TRUE, sep = ",",
                   strip.white = TRUE, na.strings = "")

Import data

Review – Cleaning and checking data

Get an overview

str(datafile)

'data.frame': 769 obs. of 12 variables:
 $ Snail.ID: int 1 1 1 1 1 1 1 1 1 1 ...
 $ Sex     : Factor w/ 4 levels "female","male",..: 2 2 4 2 2 2 2 2 2 2 ...
 $ Size    : Factor w/ 2 levels "large","small": 2 2 2 2 2 2 2 2 2 2 ...
 $ Feeding : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
 $ Distance: num 0.17 0.87 0.22 0.13 0.36 0.84 0.69 0.6 0.85 0.59 ...
 $ Depth   : num 1.66 1.26 1.43 1.46 1.21 1.56 1.62 162 1.96 1.93 ...
 $ Temp    : int 21 21 18 19 21 21 20 20 19 19 …

Review – Cleaning and checking data

Get an overview

summary(datafile)
    Snail.ID         Sex          Size       Feeding           Distance
 Min.   : 1.00   female:384   large:383   Mode :logical   Min.   :0.0000
 1st Qu.: 4.00   male  :385   small:385   FALSE:503       1st Qu.:0.2800
 Median : 8.50                NA's :  1   TRUE :266       Median :0.5100
 Mean   : 8.49                            NA's :0         Mean   :0.5125
 3rd Qu.:12.00                                            3rd Qu.:0.7500
 Max.   :16.00                                            Max.   :1.0000
... ... ... Continues

Review – Cleaning and checking data

Get an overview

head(x)
tail(x)
  Snail.ID  Sex   Size  Feeding  Distance  Depth  Temp
1        1  male  small   FALSE      0.17   1.66    21
2        1  male  small   FALSE      0.87   1.26    21
3        1  male  small   FALSE      0.22   1.43    18
4        1  male  small   FALSE      0.13   1.46    19
5        1  male  small   FALSE      0.36   1.21    21
6        1  male  small    TRUE      0.84   1.56    21

Review – Cleaning and checking data

Get an overview

str()      provides an overview of an object

summary()  returns basic statistical summary for variables

head()     returns the first records of an object

tail()     returns the last records of an object

sort()     sorts a vector or factor into ascending or descending order

order()    takes a set of vectors as arguments and sorts recursively by each vector, breaking ties by looking at successive vectors in the argument list

Rearranging and manipulating data

Built-in data

•  Many packages come with built-in datasets

•  To save memory, datasets are not loaded until they are referenced the first time

•  The function data() will list all loaded packages and their built-in datasets

•  Built-in data is usually used for examples that you can find in the help file

Course outline – Day 7

Course outline

•  Review – Checking and cleaning data

•  Rearranging and manipulating data

•  Reshaping data

•  Combining datasets

•  Making new variables

•  Subsetting data

•  Summarising data

Rearranging and manipulating data

Reshaping data

We will use data on fish abundance
Fish_survey <- read.csv("Fish_survey.csv", header = TRUE)

  Site    Month    Transect  Trout  Perch  Stickleback
1 River1  January         1     10      5           28
2 River1  January         2      0     13           42
3 River1  January         3      8     19            9
4 River2  January         1      3      5           72
5 River2  January         2      2      9           33
6 River2  January         3     15     24           65
... …

Rearranging and manipulating data

Reshaping data

We will use data on fish abundance
Fish_survey <- read.csv("Fish_survey.csv", header = TRUE)

  Site    Month    Transect  Trout  Perch  Stickleback
1 River1  January         1     10      5           28
2 River1  January         2      0     13           42
3 River1  January         3      8     19            9
4 River2  January         1      3      5           72
5 River2  January         2      2      9           33
6 River2  January         3     15     24           65
... …

Rearranging and manipulating data

Reshaping data

Reshaping data using the package tidyr
library(tidyr)
To make one single column including all three species you can use the function gather()

Rearranging and manipulating data

Reshaping data

Example gather()
Fish_survey_long <- gather(Fish_survey, Species, Abundance, 4:6)

  Site    Month    Transect  Species  Abundance
1 River1  January         1  Trout           10
2 River1  January         2  Trout            0
3 River1  January         3  Trout            8
4 River2  January         1  Trout            3
5 River2  January         2  Trout            2
6 River2  January         3  Trout           15
… ...

Rearranging and manipulating data

Reshaping data

To convert the data back into a format with separate columns for each species, use the function spread()
Example spread()
Fish_survey_wide <- spread(Fish_survey_long, Species, Abundance)

Rearranging and manipulating data

Reshaping data

Reshaping data using the package reshape2
library(reshape2)
Instead of gather() the reshape2 package uses the function melt()
Caution: Do not confuse the reshape2 library with the reshape function!!

Rearranging and manipulating data

Reshaping data

Reshaping data using the package reshape2
Example melt()
Fish_survey_long <- melt(Fish_survey,
                         id.vars = c("Site", "Month", "Transect"),
                         measure.vars = c("Trout", "Perch", "Stickleback"),
                         variable.name = "Species", value.name = "Abundance")

Rearranging and manipulating data

Reshaping data

Reshaping data using the package reshape2
Similarly, instead of spread() the reshape2 package uses the function dcast()
Example dcast()
Fish_survey_wide <- dcast(Fish_survey_long,
                          Site + Month + Transect ~ Species,
                          value.var = "Abundance")

Rearranging and manipulating data

Combining datasets

To combine datasets we will use the package dplyr
install.packages("dplyr")
library(dplyr)

Rearranging and manipulating data

Combining datasets

To combine datasets we will use the package dplyr

Rearranging and manipulating data

Combining datasets

To combine datasets we will use the package dplyr
Import datasets
Fish_survey_long <- read.csv("Fish_survey_long.csv", header = TRUE, stringsAsFactors = FALSE)
Water_data <- read.csv("Water_data.csv", header = TRUE, stringsAsFactors = FALSE)
GPS_location <- read.csv("GPS_data.csv", header = TRUE, stringsAsFactors = FALSE)

Rearranging and manipulating data

Combining datasets

Why not just use cbind()?
•  Datasets need to have the same number of rows
•  Rows need to be in the same order because rows are matched by position

X1  X2
A   1
B   1
A   2
B   2

X1  X3
A   T
A   F
B   F
B   T

X1  X4
A   1
A   2
A   3

Rearranging and manipulating data

Combining datasets

We can join datasets by using the columns they share:

Fish survey: Site, Month, Transect, Species

Water characteristics: Site, Month, Water temp., O2-content

GPS: Site, Transect, Latitude, Longitude

Rearranging and manipulating data

Combining datasets

Functions to combine datasets in dplyr

left_join(a, b, by = "x1")   Joins matching rows from b to a

right_join(a, b, by = "x1")  Joins matching rows from a to b

inner_join(a, b, by = "x1")  Returns all rows from a where there are matching values in b

full_join(a, b, by = "x1")   Joins data and returns all rows and columns

semi_join(a, b, by = "x1")   All rows in a that have a match in b, keeping just columns from a.

anti_join(a, b, by = "x1")   All rows in a that do not have a match in b

Rearranging and manipulating data

Combining datasets

1. Join water characteristics to fish abundance data using inner_join()
Fish_and_Water <- inner_join(Fish_survey_long, Water_data,
                             by = c("Site", "Month"))

Rearrangingandmanipula+ngdata

Combiningdatasets

Checkthenewdataframestr(Fish_and_Water)head(Fish_and_Water)'data.frame':72obs.of7variables:$Site:Factorw/2levels"River1","River2":1112221112...$Month:chr"January""January""January""January"...$Transect:int1231231231...$Species:Factorw/3levels"Perch","S+ckleback",..:3333333333...$Abundance:int1008321527011...$Mean_water_temp:num3.63.63.66.26.26.22.32.32.38...$Mean_O2_content:num12.612.612.61212129.89.89.812.3...

Rearranging and manipulating data

Combining datasets

2. Add GPS locations to new Fish_and_Water dataset using inner_join()
Fish_survey_combined <- inner_join(Fish_and_Water, GPS_location,
                                   by = c("Site", "Transect"))
Check if it worked:
str(Fish_survey_combined)
head(Fish_survey_combined)

Rearranging and manipulating data

Adding new variables

We will use data on bird behaviour
Bird_Behaviour <- read.csv("Bird_Behaviour.csv", header = TRUE,
                           stringsAsFactors = FALSE)
# Get an overview
str(Bird_Behaviour)

X1  X2
A   1
B   1
A   2
B   2

X1  X2  X3
A   1   T
B   1   F
A   2   T
B   2   F

Rearranging and manipulating data

Adding new variables

Three ways of adding a new variable (log of FID)
Using $
Bird_Behaviour$log_FID <- log(Bird_Behaviour$FID)
Using the [] operator
Bird_Behaviour[, "log_FID"] <- log(Bird_Behaviour$FID)
Using mutate() from the dplyr package
Bird_Behaviour <- mutate(Bird_Behaviour, log_FID = log(FID))

Rearranging and manipulating data

Adding new variables

Adding a new variable
head(Bird_Behaviour)
  Ind  Species            Sex     Year  FID  Disturbance  Fledglings  log_FID
1 PD1  Passer_domesticus  male    2013    5            8           1  1.6094379
2 PD1  Passer_domesticus  male    2014    2           40           4  0.6931472
3 PD1  Passer_domesticus  male    2015    8           30           4  2.0794415
4 PD2  Passer_domesticus  female  2013   10           35           3  2.3025851
5 PD2  Passer_domesticus  female  2014   10           15           0  2.3025851
6 PD2  Passer_domesticus  female  2015    6            6           2  1.7917595

Rearranging and manipulating data

Adding new variables

Split one column into two using separate() from the tidyr package
Bird_Behaviour <- separate(Bird_Behaviour, Species,
                           c("Genus", "Species"), sep = "_", remove = TRUE)

X1  X2
A   1_1
B   1_2
A   2_1
B   2_2

X1  X2.1  X2.2
A   1     1
B   1     2
A   2     1
B   2     2

Rearranging and manipulating data

Adding new variables

Split one column into two using separate() from the tidyr package
head(Bird_Behaviour)
  Ind  Genus   Species     Sex     Year  FID  ...
1 PD1  Passer  domesticus  male    2013    5  ...
2 PD1  Passer  domesticus  male    2014    2  ...
3 PD1  Passer  domesticus  male    2015    8  ...
4 PD2  Passer  domesticus  female  2013   10  ...
5 PD2  Passer  domesticus  female  2014   10  ...
6 PD2  Passer  domesticus  female  2015    6  ...

Rearranging and manipulating data

Adding new variables

Combine two columns using unite() from the tidyr package
Bird_Behaviour <- unite(Bird_Behaviour, "Genus_Species",
                        c(Genus, Species), sep = "_", remove = TRUE)

X1  X2
A   1_1
B   1_2
A   2_1
B   2_2

X1  X2.1  X2.2
A   1     1
B   1     2
A   2     1
B   2     2

Rearranging and manipulating data

Adding new variables

Combine two columns using unite() from the tidyr package
head(Bird_Behaviour)
  Ind  Genus_Species      Sex     Year  FID  ...
1 PD1  Passer_domesticus  male    2013    5  ...
2 PD1  Passer_domesticus  male    2014    2  ...
3 PD1  Passer_domesticus  male    2015    8  ...
4 PD2  Passer_domesticus  female  2013   10  ...
5 PD2  Passer_domesticus  female  2014   10  ...
6 PD2  Passer_domesticus  female  2015    6  ...

Rearranging and manipulating data

Subsetting data

Subsetting data
•  Using the [] operator
•  Using subset()
•  Subsetting with functions from the dplyr package
   •  slice()
   •  filter()
   •  sample_frac()
   •  sample_n()
   •  select()

Rearranging and manipulating data

Subsetting data

Subsetting using the [] operator
Examples:
Bird_Behaviour[, 1:4]                 # selects the first 4 columns
Bird_Behaviour[c(2, 3), ]             # selects rows 2 and 3
Bird_Behaviour[1:3, 1:4]              # selects the rows 1 to 3 and columns 1 to 4
Bird_Behaviour[c(1:3, 6), c(1:4, 8)]  # selects the rows 1 to 3 and 6, and the columns 1 to 4 and 8

Rearranging and manipulating data

Subsetting data

Subsetting using the [] and $ operators
Examples:
Bird_Behaviour[Bird_Behaviour$Sex == "male", ]  # selects all rows with males

Rearranging and manipulating data

Subsetting data

Subsetting using subset()
subset(x, subset, select, ...)

Argument  Description
x         The object from which to extract subset
subset    A logical expression that describes the set of rows to return
select    An expression indicating which columns to return

Rearranging and manipulating data

Subsetting data

Examples subset():
subset(Bird_Behaviour, FID < 10)
# selects all rows with FID smaller than 10 m
subset(Bird_Behaviour, FID < 10 & Sex == "male")
# selects all rows for males with FID smaller than 10 m
subset(Bird_Behaviour, FID > 10 | FID < 15, select = c(Ind, Sex, Year))
# selects all rows that have a value of FID greater than 10 or less than 15. We keep only the Ind, Sex and Year columns

Rearranging and manipulating data

Subsetting data

Review of logical operators
Check out ?base::Logic and ?Comparison to learn more

Operator  Description
>         greater than
>=        greater than or equal to
<         less than
<=        less than or equal to
==        equal to
!=        not equal to
x & y     x and y
x | y     x or y

Rearranging and manipulating data

Subsetting by rows using slice() and filter()
Examples slice() and filter():
Bird_Behaviour.slice <- slice(Bird_Behaviour, 3:5)
# selects rows 3-5
Bird_Behaviour.filter <- filter(Bird_Behaviour, FID < 5)
# selects rows that meet certain criteria

Subsetting rows in dplyr

Rearranging and manipulating data

Taking a random sample of rows using sample_frac() and sample_n()
Examples sample_frac() and sample_n():
Bird_Behaviour.50 <- sample_frac(Bird_Behaviour, size = 0.5,
                                 replace = FALSE)
# takes randomly 50% of the rows
Bird_Behaviour_50Rows <- sample_n(Bird_Behaviour, 50,
                                  replace = FALSE)
# takes randomly 50 rows

Subsetting rows in dplyr

Rearranging and manipulating data

Subsetting by columns using select()
Examples select():
Bird_Behaviour_col <- select(Bird_Behaviour, Ind, Sex, Fledglings)
# selects the columns Ind, Sex, and Fledglings
Bird_Behaviour_reduced <- select(Bird_Behaviour, -Disturbance)
# excludes the variable Disturbance

Subsetting columns in dplyr

Rearranging and manipulating data

Summarising data

Summarizing data with dplyr
Get the overall mean for FID using summarise() and mean()
summarise(Bird_Behaviour, mean.FID = mean(FID))
  mean.FID
1 11.82639

Rearranging and manipulating data

Summarising data

Summarizing data with dplyr
We can add other measurements to this:
summarise(Bird_Behaviour,
          mean.FID = mean(FID),   # mean
          min.FID = min(FID),     # minimum
          max.FID = max(FID),     # maximum
          med.FID = median(FID),  # median
          sd.FID = sd(FID),       # standard deviation
          var.FID = var(FID),     # variance
          n.FID = n())            # sample size

Rearranging and manipulating data

Summarising data

Summarizing data with dplyr
We can add other measurements to this:
  mean.FID  min.FID  max.FID  med.FID  sd.FID    var.FID  n.FID
1 11.82639        1       30       10  8.082036  65.3193    144

Rearranging and manipulating data

Summarising data

Summarizing data with dplyr
Get summaries for each species
Before you calculate summaries, you have to apply the group_by() function
Bird_Behaviour_by_Species <- group_by(Bird_Behaviour, Species)

Rearranging and manipulating data

Summarising data

Summarizing data with dplyr
After we applied the group_by() function, we can get summaries for each species
Summary.species <- summarise(Bird_Behaviour_by_Species,
                             mean.FID = mean(FID),   # mean
                             min.FID = min(FID),     # minimum
                             max.FID = max(FID),     # maximum
                             med.FID = median(FID),  # median
                             sd.FID = sd(FID),       # standard deviation
                             var.FID = var(FID),     # variance
                             n.FID = n())            # sample size

Rearranging and manipulating data

Summarising data

Summarizing data with dplyr
as.data.frame(Summary.species)
  Species            mean.FID  min.FID  max.FID  med.FID  sd.FID  var.FID  n.FID
1 Fringilla_coelebs     20.44        5       30       21    6.31    39.83     48
2 Passer_domesticus      6.10        1       10        7    3.12     9.71     48
3 Passer_montanus        8.94        1       20        8    5.61    31.51     48

Rearranging and manipulating data

Which R functions did we learn?

gather()       takes multiple columns and collapses them into key-value pairs

spread()       spreads a key-value pair across multiple columns

melt()         reshapes wide format to long format

dcast()        reshapes long format to wide format

inner_join()   Joins data and returns all rows from x where there are matching values in y, and all columns from x and y

separate()     separates single column into multiple columns

unite()        pastes multiple columns into one

subset()       returns subsets which meet certain conditions

slice()        selects rows by position

filter()       extracts rows that meet logical criteria

sample_frac()  randomly selects a fraction of rows

Rearranging and manipulating data

Which R functions did we learn?

sample_n()   randomly selects n rows

select()     selects columns by name or helper function

summarise()  summarises multiple values to a single value

mean()       computes the arithmetic mean

min()        returns the minimum of the input values

max()        returns the maximum of the input values

median()     computes the median

sd()         computes the standard deviation

var()        computes the variance

n()          returns the number of rows

group_by()   takes an existing table and converts it into a grouped table where operations are performed "by group"

View()       invokes a spreadsheet-style data viewer on a matrix-like object