本文共 3768 字,大约阅读时间需要 12 分钟。
数据科学家所需要具备的知识与技能
Hacking Skills、Math & Statistics Knowledge、Substantive Expertise * 完整的数据分析流程 * 数值变量的特征和可视化
集中趋势的测量:均值(mean)、中位数(median)、众数(mode) 分散趋势的测量:值域、方差、标准差、四分位距 一个变量的可视化 –直方图(histogram)、点图(dot plot) –箱图(box plot) 两个变量的关系
–散点图(scatter plot):方向、形状、强度、极端值 分类变量的特征和可视化
–频率表(frequency table)、条形图(bar plot) 两个分类变量的关系 –关联表(contingency table)、相对频率表(relative frequencies) –分段条形图、相对频率分段条形图 –马赛克图(mosaicplot) 一个分类变量和一个数值变量的关系 –并排箱图
R的三大绘图系统
–基本绘图系统:图+修饰/添加=执行一系列函数 –Lattice绘图系统:一次成图,特别适用于变量间的交互 –ggplot2绘图系统 基本绘图系统(graphics包)
# Base graphics functions: plot / hist / boxplot / points / lines / text /
# title / axis
#
# Walk-through of the base plotting system on the `airquality` data set:
# one-variable plots, a scatter plot that is decorated step by step, and a
# look at a few `par()` settings.
library(MASS)
data(airquality)
head(airquality)

# One variable: histograms and box plots of wind speed.
hist(airquality$Wind)
hist(airquality$Wind, xlab = "Wind")
boxplot(airquality$Wind, xlab = "Wind", ylab = "Speed(mph)")
# Grouped by month, so the x axis is Month rather than Wind.
boxplot(Wind ~ Month, airquality, xlab = "Month", ylab = "Speed(mph)")

# Two variables: scatter plot of temperature against wind speed.
plot(airquality$Wind, airquality$Temp)
with(airquality, plot(Wind, Temp))  # with() supplies the data set
title(main = "Wind and Temp in NYC")  # decoration: add a title

# Draw an empty frame first (type = "n"), then add the points month by month
# so that different colours show the wind speed of different months.
with(
  airquality,
  plot(Wind, Temp, main = "Wind and Temp in NYC", type = "n")
)
with(subset(airquality, Month == 9), points(Wind, Temp, col = "red"))
with(subset(airquality, Month == 5), points(Wind, Temp, col = "blue"))
with(
  subset(airquality, Month %in% c(6, 7, 8)),
  points(Wind, Temp, col = "black")
)

# Fit a linear model and overlay the fitted line.
fit <- lm(Temp ~ Wind, airquality)
abline(fit, lwd = 2)

# Annotation: legend for the three colour groups.
legend(
  "topright",
  pch = 1,
  col = c("red", "blue", "black"),
  legend = c("Sep", "May", "Other")
)

# Query some global graphics parameters.
par("bg")
par("col")
par("mar")  # bottom, left, top, right
par("mfrow")

# Put two plots into a one-row, two-column layout. par() returns the previous
# settings, which are kept so the layout can be restored afterwards.
old_par <- par(mfrow = c(1, 2))
hist(airquality$Wind)
hist(airquality$Temp)
par("mfcol")  # similar to mfrow
par(old_par)
Lattice绘图系统
# Plotting functions (lattice package): xyplot / bwplot / histogram /
# stripplot / dotplot / splom / levelplot / contourplot
# Call format: xyplot(y ~ x | f * g, data)
# Panel functions control what is drawn inside each panel (grid package).
library(lattice)
library(MASS)
data(airquality)

# Plain scatter plot of temperature against ozone.
xyplot(Temp ~ Ozone, data = airquality)

# Condition on Month: convert it to a factor, then draw one panel per month
# in a five-column, one-row layout.
airquality$Month <- factor(airquality$Month)
xyplot(Temp ~ Ozone | Month, data = airquality, layout = c(5, 1))

# A lattice call returns an object; it is drawn when printed.
q <- xyplot(Temp ~ Ozone | Month, data = airquality, layout = c(5, 1))
print(q)

# Simulated two-group data with a fixed seed for reproducibility.
set.seed(1)
x <- rnorm(100)
f <- rep(0:1, each = 50)
y <- x + f - f * x + rnorm(100, sd = 0.5)
f <- factor(f, labels = c("Group1", "Group2"))
xyplot(y ~ x | f, layout = c(2, 1))

# Custom panel function: points, dashed lines at the mean of x and the mean
# of y, and a red regression line in every panel.
xyplot(
  y ~ x | f,
  panel = function(x, y) {
    panel.xyplot(x, y)
    panel.abline(v = mean(x), h = mean(y), lty = 2)
    panel.lmline(x, y, col = "red")
  }
)
ggplot2绘图系统
# Layers (Layer); entry points: qplot() and ggplot()
library(ggplot2)
library(MASS)

# Mapping Month to shape needs a discrete variable. Converting here is a
# no-op when Month is already a factor, so the examples below behave the
# same whether or not an earlier section converted it.
airquality$Month <- factor(airquality$Month)

# qplot(): quick plots, mapping Month to different aesthetics.
qplot(Wind, Temp, data = airquality)
qplot(Wind, Temp, data = airquality, col = Month)
qplot(Wind, Temp, data = airquality, shape = Month)
qplot(Wind, Temp, data = airquality, size = Month)
# I("red") sets a fixed colour instead of mapping a variable.
qplot(
  Wind, Temp,
  data = airquality,
  col = I("red"),
  xlab = "Wind(mph)",
  ylab = "Temp",
  main = "Wind vs Temp"
)

# Several geoms in one plot.
qplot(Wind, Temp, data = airquality, geom = c("point", "smooth"))
qplot(
  Wind, Temp,
  data = airquality,
  col = Month,
  geom = c("point", "smooth")
)

# Facets: one panel per month, as columns and as rows.
qplot(Wind, Temp, data = airquality, facets = . ~ Month)
qplot(Wind, data = airquality, facets = Month ~ .)

# One-variable plots.
qplot(Wind, data = airquality, fill = Month)
qplot(Wind, data = airquality, geom = "density")
qplot(y = Wind, data = airquality)

# ggplot(): build the plot up layer by layer.
ggplot(airquality, aes(Wind, Temp)) +
  geom_point(color = "steelblue", alpha = 0.4, size = 5)

ggplot(airquality, aes(Wind, Temp)) +
  geom_point(aes(color = factor(Month)), alpha = 0.7, size = 1)

ggplot(airquality, aes(Wind, Temp)) +
  geom_point() +
  geom_smooth()

ggplot(airquality, aes(Wind, Temp)) +
  stat_smooth(method = "lm", se = FALSE, aes(col = factor(Month)))

# Custom palette. The plot below has six colour levels (five months plus the
# "All" trend line), so six colours are requested.
library(RColorBrewer)
myColor <- c(brewer.pal(6, "Dark2"))
display.brewer.pal(6, "Dark2")

# Points per month, one overall regression line ("All"), and one regression
# line per month, all drawn with the manual palette.
ggplot(airquality, aes(Wind, Temp, col = factor(Month))) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE, aes(group = 1, col = "All")) +
  stat_smooth(method = "lm", se = FALSE) +
  scale_color_manual("Month", values = myColor)
R语言绘图之颜色
grDevices包 –colorRamp() & colorRampPalette() 颜色名字可使用colors()获取 RColorBrewer包
–三类调色板:sequential/diverging(突出极端)/qualitative(适用分类变量)