Original link: tecdat.cn/?p=22751

Original source: Tuoduan Data Tribe official account

When mining huge volumes of bus and subway route information, general-purpose software runs into two main problems: (1) for text mining, and especially for mining Chinese vocabulary, mature tools or packages are scarce; (2) large data volumes cause trouble at the reading and processing stage. Even for a single month, the route records of some regions can exceed several hundred megabytes. For data like this, ordinary SQL or MATLAB is not well suited, either for running the algorithms or simply for reading the data in. Instead, we generally use R, which makes the reading, mining and visualization steps straightforward.

For example, consider the following station data:

We also have nearly 600 MB of station entry and exit records. If we want to tabulate and visualize how many passengers enter and leave the stations of each route in every time interval, we can proceed through the following analysis steps.

First, we read and preprocess the data:

install.packages("dplyr")
library("dplyr")                                   # load the dplyr package for sorting/arranging

### Read the data
data=read.table("E:\\201501一卡通进出站.txt",stringsAsFactors=F)

## Name the data columns
colnames(data)=c("逻辑卡号",
                 "交易日期",
                 "交易时间",
                 "票种",
                 "交易代码",
                 "交易车站",
                 "上次交易车站")

### Preprocess the data
for( ii in 20150101:20150131){                     # loop over the days of the month

  data1=data[which(data[,2]==ii),]                 # keep the records of day ii
  data2=data1[,c(2,3,6,7)]                         # keep 交易日期, 交易时间, 交易车站, 上次交易车站
  # data2                                          # (inspect the data if needed)
  data2=data2[order(data2$交易车站),]               # sort by station code

  line1=data2[substr(data2$交易车站,1,1)=="1",]     # Line 1 (station codes starting with "1")
  line2=data2[substr(data2$交易车站,1,1)=="2",]     # Line 2 (station codes starting with "2")

  ### Process each station separately
  bus=unique(data2[,3])                            # all station codes of the day

  for(busi in 1:length(bus)){

    index=which(data2[,3]==bus[busi])              # row numbers of the current station
    data3=data2[index,]                            # records of the current station
    ### data3=data2[order(data2$交易车站),]          # (alternative: skip the station filter and sort by station only)
    data4=arrange(data3,交易日期,交易时间)           # sort by date, then by time within the day

    ### Split the day into 10-minute intervals (06:00-22:00)
    for (time in 6:21){
      for(i in 1:6){
        # 交易时间 is an HHMMSS integer, so this picks the records between hh:(i-1)0:00 and hh:i0:00
        index=intersect(which(data4[,2]>time*10000+(i-1)*1000),which(data4[,2]<=time*10000+1000*i))
        datat=data4[index,]
        outnum=length(which(datat[,4]!=0))         # 上次交易车站 != 0 : exit record
        innum=length(which(datat[,4]==0))          # 上次交易车站 == 0 : entry record
        if(i!=6) cat(file=paste("E:\\",bus[busi],"车站",ii,"日一卡通进出站时间.txt"),append=TRUE,ii,"日",time,"点",i-1,"0分到",i,"0分的出站人数为",outnum," ","进站人数为",innum,"\n")
        else     cat(file=paste("E:\\",bus[busi],"车站",ii,"日一卡通进出站时间.txt"),append=TRUE,ii,"日",time,"点",i-1,"0分到",time+1,"点0分的出站人数为",outnum," ","进站人数为",innum,"\n")
      }
    }

    # Separate exit and entry records for this station
    dataout=data3[which(data3[,4]!=0),]            # 上次交易车站 != 0 : exits
    datain=data3[which(data3[,4]==0),]             # 上次交易车站 == 0 : entries

    ### Write the sorted records of this station to a file
    write.table(data4,paste("E:\\",ii,"日 ",bus[busi],"车站一卡通进出站整理.txt"))
  }
}

####################################################################################
################ Lines 1 and 2 ##########
data2=data2[order(data2$交易车站),]
line1=data2[substr(data2$交易车站,1,1)=="1",]       # Line 1
line2=data2[substr(data2$交易车站,1,1)=="2",]       # Line 2

######### Line 1
data4=arrange(line1,交易日期,交易时间)               # sort by date, then by time within the day

### Split into 10-minute intervals and write a header line
cat(file="E:\\1号线一卡通进出站时间.txt",append=TRUE, " 点", " 分"," 出站人数", " ","进站人数 " ,"\n")

for (time in 6:21){
  for(i in 1:6){
    index=intersect(which(data4[,2]>time*10000+(i-1)*1000),which(data4[,2]<=time*10000+1000*i))
    datat=data4[index,]
    outnum=length(which(datat[,4]!=0))             # exits in this interval
    innum=length(which(datat[,4]==0))              # entries in this interval
    cat(file="E:\\1号线一卡通进出站时间.txt",append=TRUE,time," ",i-1,"0 "," ",outnum," "," ",innum,"\n")
  }
}

# Exit and entry totals for Line 1
dataout=data4[which(data4[,4]!=0),]                # 上次交易车站 != 0 : exits
datain=data4[which(data4[,4]==0),]                 # 上次交易车站 == 0 : entries
numout=dim(dataout)[1]                             # total number of exits
numin=dim(datain)[1]                               # total number of entries

### Write the sorted Line 1 records to a file
write.table(data4,"E:\\1号线一卡通进出站整理.txt")

######## Line 2
data4=arrange(line2,交易日期,交易时间)               # sort by date, then by time within the day

### Split into 10-minute intervals and write a header line
cat(file="E:\\2号线一卡通进出站时间.txt",append=TRUE, " 点", " 分"," 出站人数", " ","进站人数 " ,"\n")

for (time in 6:21){
  for(i in 1:6){
    index=intersect(which(data4[,2]>time*10000+(i-1)*1000),which(data4[,2]<=time*10000+1000*i))
    datat=data4[index,]
    outnum=length(which(datat[,4]!=0))             # exits in this interval
    innum=length(which(datat[,4]==0))              # entries in this interval
    cat(file="E:\\2号线一卡通进出站时间.txt",append=TRUE,time," ",i-1,"0 "," ",outnum," "," ",innum,"\n")
  }
}

# Exit and entry records for Line 2
dataout=data4[which(data4[,4]!=0),]                # exits
datain=data4[which(data4[,4]==0),]                 # entries

### Write the sorted Line 2 records to a file
write.table(data4,"E:\\2号线一卡通进出站整理.txt")

######### Lines 1 and 2 combined
data4=arrange(line1,交易日期,交易时间)               # Line 1, sorted by date and time
data44=arrange(line2,交易日期,交易时间)              # Line 2, sorted by date and time

cat(file="E:\\1,2号线一卡通进出站时间.txt",append=TRUE, " 点", " 分"," 出站人数", " ","进站人数 " ,"\n")
for (time in 6:21){
  for(i in 1:6){
    index=intersect(which(data4[,2]>time*10000+(i-1)*1000),which(data4[,2]<=time*10000+1000*i))
    index2=intersect(which(data44[,2]>time*10000+(i-1)*1000),which(data44[,2]<=time*10000+1000*i))
    datat=data4[index,]
    datat1=data44[index2,]
    outnum=length(which(datat[,4]!=0))             # Line 1 exits
    outnum1=length(which(datat1[,4]!=0))           # Line 2 exits
    innum=length(which(datat[,4]==0))              # Line 1 entries
    innum1=length(which(datat1[,4]==0))            # Line 2 entries
    cat(file="E:\\1,2号线一卡通进出站时间.txt",append=TRUE,time," ",i-1,"0 "," ",outnum+outnum1," "," ",innum+innum1,"\n")
  }
}
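As a side note, the same 10-minute entry/exit counting can also be expressed much more compactly with dplyr verbs. The sketch below is not part of the original script: it assumes the column layout of data2 above (交易日期, 交易时间, 交易车站, 上次交易车站, with 交易时间 stored as an HHMMSS integer), handles the interval boundaries slightly differently from the loops, and uses an illustrative output file name.

library(dplyr)

# Assign every record to a 10-minute interval and count entries/exits per station
flows = data2 %>%
  mutate(hour     = 交易时间 %/% 10000,                      # HHMMSS integer -> hour
         minute   = (交易时间 %/% 100) %% 100,               # HHMMSS integer -> minute
         interval = sprintf("%02d:%02d", hour, (minute %/% 10) * 10)) %>%
  filter(hour >= 6, hour <= 21) %>%
  group_by(交易车站, interval) %>%
  summarise(outnum = sum(上次交易车站 != 0),                  # exits: previous station is not 0
            innum  = sum(上次交易车站 == 0))                  # entries: previous station is 0

write.table(flows, "E:\\一卡通进出站10分钟统计.txt", row.names = FALSE)   # illustrative file name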

Through the above process, the organized data is written out to the corresponding files:

Next comes the visualization of the traffic routes.

The igraph package in R is a practical tool for drawing the network diagram of the traffic routes:

ljhdat1=readLines("E:/shanghai_1.txt")
ljhdat2=readLines("E:/shanghai_2.txt")
ljhdat3=readLines("E:/shanghai_3.txt")
ljhdat4=readLines("E:/shanghai_4.txt")
ljhdat5=readLines("E:/shanghai_5.txt")

### Collect the route description lines (the line before each blank line) from every file
bus=""
for(i in 1:length(ljhdat1)){ if(ljhdat1[i]=="") bus=c(bus,ljhdat1[i-1]) }
for(i in 1:length(ljhdat2)){ if(ljhdat2[i]=="") bus=c(bus,ljhdat2[i-1]) }
for(i in 1:length(ljhdat3)){ if(ljhdat3[i]=="") bus=c(bus,ljhdat3[i-1]) }
for(i in 1:length(ljhdat4)){ if(ljhdat4[i]=="") bus=c(bus,ljhdat4[i-1]) }
for(i in 1:length(ljhdat5)){ if(ljhdat5[i]=="") bus=c(bus,ljhdat5[i-1]) }
bus=bus[-1]                                        # drop the initial empty placeholder element

################## Extract the station information of each route ##################
route=list(0)                                      # list holding the station sequence of each route
route[[1]]=unlist(strsplit(bus[1],split=" "))      # split the first route into station names
route[[1]]=route[[1]][-which(route[[1]]=="#")]     # delete the "#" separators
n=length(route[[1]])

library(igraph)
d = data.frame(route[[1]][1:(n-1)],route[[1]][2:n])   # edge list: consecutive stations are adjacent
g = graph.data.frame(d, directed = TRUE)
plot(g)

################## Split the station information of all lines ##################
route1=character(0)

Because the final network graph contains a large number of routes, the plotting parameters can be tuned to further improve the view.
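For example, a few common plot() options in igraph (a sketch, not from the original post; the parameter values are just a starting point) shrink the vertices, labels and arrows and use a force-directed layout, which usually makes a dense route network easier to read:

plot(g,
     vertex.size      = 3,                  # smaller station nodes
     vertex.label.cex = 0.6,                # smaller station labels
     vertex.color     = "lightblue",
     edge.arrow.size  = 0.3,                # smaller arrow heads
     layout           = layout_with_fr(g))  # Fruchterman-Reingold force-directed layout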


Most popular insights

1. Data side of data job demand

2. Research hot spots of big data journal articles

3. Machine learning boosts accurate sales forecast of fast fashion

4. Machine learning to identify changing stock market conditions — the application of hidden Markov model (HMM)

5. Data listening to the “online events” on the message board of People’s Daily Online

6. Use GAM (Generalized additive Model) in R language for power load time series analysis

7. Interpreting sports decisions with data: Mining the new value of sports Events

8. Take the pulse of taxi data

9. Data introduction of smart door lock "cutting hands"