Data sample: below is the processed data. Here I chose to fill in the missing values, though truncating/dropping the incomplete records would also work.
Many people have messaged me asking for the data file; since I started working I rarely use CSDN anymore, sorry~
Link: https://pan.baidu.com/s/1vSj7qu4xHCefbQELKsp1xA
Extraction code: jvbx
Python script:
import pandas as pd
import numpy as np

df = pd.read_csv("D:\\fenbushi\\hadoop\\Cars.csv", encoding='gbk')

# Columns missing from the raw file: create them with default values
df["厢长"] = 0
df["厢宽"] = 0
df["厢高"] = 0
df["轴数"] = 2
df["准牵引质量"] = 0

# For each column, replace NaN and empty strings with a sensible default
Fuel_type = df["燃料种类"].fillna("汽油")
Fuel_type[Fuel_type == ''] = "汽油"
df["燃料种类"] = Fuel_type

car_width = df["车宽"].fillna(1620)
car_width[car_width == ''] = 1620
df["车宽"] = car_width

car_height = df["车高"].fillna(1915)
car_height[car_height == ''] = 1915
df["车高"] = car_height

wheelbase = df["轴距"].fillna(2700)
wheelbase[wheelbase == ''] = 2700
df["轴距"] = wheelbase

front_track = df["前轮距"].fillna(1390)
front_track[front_track == ''] = 1390
df["前轮距"] = front_track

Tire_specifications = df["轮胎规格"].fillna("175/70R14LT")
Tire_specifications[Tire_specifications == ''] = "175/70R14LT"
df["轮胎规格"] = Tire_specifications

luntaishu = df['轮胎数'].fillna(4)
luntaishu[luntaishu == ''] = 4
df['轮胎数'] = luntaishu
#print(df["轮胎数"])

Total_mass = df["总质量"].fillna(1850)
Total_mass[Total_mass == ''] = 1850
df["总质量"] = Total_mass

Curb_quality = df["整备质量"].fillna(1850)
Curb_quality[Curb_quality == ''] = 1850
df["整备质量"] = Curb_quality

#del df["厢长"]
#del df["厢宽"]
#del df["厢高"]
#del df["核定载质量"]

# Restore passenger-capacity ranges that were converted to dates (e.g. '4月9日' -> "4|9")
Approved_passenger = df["核定载客"].fillna("24-26")
Approved_passenger[Approved_passenger == '4月9日'] = "4|9"
Approved_passenger[Approved_passenger == '7月8日'] = "7|8"
Approved_passenger[Approved_passenger == '5月8日'] = "5|8"
Approved_passenger[Approved_passenger == '5月9日'] = "5|9"
Approved_passenger[Approved_passenger == '6月9日'] = "6|9"
Approved_passenger[Approved_passenger == '10月14日'] = "10|14"
Approved_passenger[Approved_passenger == '10月15日'] = "10|15"
Approved_passenger[Approved_passenger == '10月19日'] = "10|19"
Approved_passenger[Approved_passenger == '10月23日'] = "10|23"
df["核定载客"] = Approved_passenger

quality = df["核定载质量"].fillna(0)
quality[quality == 0] = 50
quality[quality == ''] = 50
df["核定载质量"] = quality

traction_quality = df["准牵引质量"].fillna(0)
traction_quality[traction_quality == ''] = 0
df["准牵引质量"] = traction_quality
#print(df["准牵引质量"])

chassis_Enterprise = df["底盘企业"].fillna(0)
chassis_Enterprise[chassis_Enterprise == 0] = "江铃汽车股份有限公司"
chassis_Enterprise[chassis_Enterprise == ''] = "江铃汽车股份有限公司"
df["底盘企业"] = chassis_Enterprise
#print(df["底盘企业"])

chassis_brand = df["底盘品牌"].fillna(0)
chassis_brand[chassis_brand == 0] = "航天"
chassis_brand[chassis_brand == ''] = "航天"
df["底盘品牌"] = chassis_brand
#print(df["底盘品牌"])

chassis_model = df["底盘型号"].fillna(0)
chassis_model[chassis_model == 0] = "GHT6402E"
chassis_model[chassis_model == ''] = "GHT6402E"
chassis_model[chassis_model == '-'] = "GHT6402E"
df["底盘型号"] = chassis_model
#print(df["底盘型号"])

age = df["年龄"].fillna(1921)
age[age == ''] = 1921
df["年龄"] = age
#print(df["年龄"])

sex = df["性别"].fillna(0)
sex[sex == 0] = "男性"
sex[sex == ''] = "男性"
df["性别"] = sex
print(df["性别"])

# index=False keeps the column positions aligned for the MapReduce/Hive steps below
df.to_csv("D:\\fenbushi\\hadoop\\Cars2.csv", index=False)
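A quick sanity check after running the script, to confirm no missing values slipped through (a sketch; the column list simply mirrors the fields filled above):

# verify that no NaN or empty-string values remain in the filled columns
cols = ["燃料种类", "车宽", "车高", "轴距", "前轮距", "轮胎规格", "轮胎数",
        "总质量", "整备质量", "核定载客", "核定载质量", "准牵引质量",
        "底盘企业", "底盘品牌", "底盘型号", "年龄", "性别"]
print(df[cols].isnull().sum())   # expect 0 for every column
print((df[cols] == '').sum())    # expect 0 for every column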
This post covers only the following two requirements:
1. The proportion of car sales in each month of 2013 for Shanxi Province.
2. The distribution of monthly sales across the different brands.
Each is implemented two ways below: with MapReduce, and with HiveQL.
I. MapReduce implementation
1. Proportion of car sales in each month of 2013 in Shanxi Province
package com.cxy.car;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class Car_sale_Bili {

    public static class BiliMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // note: StringUtils.split collapses consecutive delimiters, so this
            // assumes no empty fields (the data was pre-filled by the script above)
            String[] lines = StringUtils.split(line, ",");
            // need at least 5 fields: province is column 0, month column 1, year column 4
            if (null != lines && lines.length > 4 && !lines[0].isEmpty()) {
                if (lines[0].equals("山西省") && lines[4].equals("2013")) {
                    // emit (month, 1) for every matching sale record
                    context.write(new Text(lines[1]), new LongWritable(1));
                }
            }
        }
    }

    public static class BiliReduce extends Reducer<Text, LongWritable, Text, DoubleWritable> {
        // grand total and per-month counts, accumulated across all reduce() calls
        double all = 0;
        Map<String, Long> maps = new HashMap<String, Long>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            all += count;
            maps.put(key.toString(), count);
        }

        // runs once after the last reduce() call, when the grand total is complete
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            Set<String> keySet = maps.keySet();
            for (String str : keySet) {
                long value = maps.get(str);
                double bili = value / all;
                context.write(new Text(str), new DoubleWritable(bili));
            }
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Car_sale_Bili.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        job.setMapperClass(BiliMapper.class);
        job.setReducerClass(BiliReduce.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean flag = job.waitForCompletion(true);
        System.exit(flag ? 0 : 1);
    }
}
A note on the cleanup() method used in the reducer: cleanup() runs exactly once, after the final reduce() call, so by that point the grand total all is complete and the per-month ratios can safely be emitted (this pattern is only correct when the job runs with a single reducer, the default here). For more background, see:
大数据 cleanup用法
2. Distribution of monthly sales across different brands
package com.cxy.car;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Car_sale_fenbu {

    public static class PPMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] lines = StringUtils.split(line, ",");
            // month is column 1, brand is column 7; emit ("month\tbrand", 1)
            if (null != lines && lines.length > 7 && lines[1] != null && lines[7] != null) {
                context.write(new Text(lines[1] + "\t" + lines[7]), new IntWritable(1));
            }
        }
    }

    public static class PPReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Car_sale_fenbu.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(PPMapper.class);
        job.setReducerClass(PPReduce.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
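One optional improvement, not in the original code: since PPReduce just sums integer counts (an associative and commutative operation), the same class can be registered as a combiner to pre-aggregate on the map side and shrink the shuffle. It takes one extra line in main():

job.setCombinerClass(PPReduce.class);

(This would not work for the first job, whose reducer keeps state and changes output types.)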
To run the jobs, package the classes into a jar, copy the jar onto your CentOS/Ubuntu machine,
and then execute the following Hadoop command:
hadoop jar <jar path> <fully qualified name of the class containing main> <input path> <output path>
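For example, to run the first job (the jar name and HDFS paths below are hypothetical; substitute your own):

hadoop jar /home/hadoop/car.jar com.cxy.car.Car_sale_Bili /cars /out/bili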
II. Hive implementation
This assumes your cluster already has Hadoop and Hive installed; for Hive installation, see:
Centos下hive安装 (the same steps apply on Ubuntu)
First, create a table:
create table cars(
  province string,   -- province
  month int,         -- month
  city string,       -- city
  county string,     -- county
  year int,          -- year
  cartype string,    -- vehicle model
  productor string,  -- manufacturer
  brand string,      -- brand
  mold string,       -- vehicle type
  owner string,      -- ownership
  nature string,     -- usage nature
  number int,        -- quantity
  ftype string,      -- engine model
  outv int,          -- displacement
  power double,      -- power
  fuel string,       -- fuel type
  length int,        -- vehicle length
  width int,         -- vehicle width
  height int,        -- vehicle height
  xlength int,       -- cargo box length
  xwidth int,        -- cargo box width
  xheight int,       -- cargo box height
  count int,         -- number of axles
  base int,          -- wheelbase
  front int,         -- front track
  norm string,       -- tire specification
  tnumber int,       -- number of tires
  total int,         -- gross mass
  curb int,          -- curb weight
  hcurb int,         -- approved load mass
  passenger string,  -- approved passenger capacity
  zhcurb int,        -- approved towing mass
  business string,   -- chassis enterprise
  dtype string,      -- chassis brand
  fmold string,      -- chassis model
  fbusiness string,  -- engine enterprise
  name string,       -- vehicle name
  age int,           -- age
  sex string         -- sex
)
row format delimited fields terminated by ','  -- the field delimiter; my exported txt is comma-separated
location '/cars';
Then load the sample data (this assumes the data file already exists on the machine):
load data local inpath '/home/hadoop/Cars.txt' into table cars;
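A quick sanity check that the load succeeded (a sketch; the exact row count depends on your copy of the data):

select count(*) from cars;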
1. Proportion of car sales in each month of 2013 in Shanxi Province
select a.year, a.month, a.counts/b.sums
from
(select year, month, sum(number) counts
 from cars
 where province = '山西省' and year = 2013
 group by year, month
 having month is not null) a,
(select year, sum(number) sums
 from cars
 where province = '山西省' and year = 2013
 group by year) b;
Result: one row per month of 2013 with that month's share of the annual total.
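On newer Hive versions (0.11+), the same ratio can be computed without the self-join by applying a window function over the grouped totals; a sketch under the same filters:

select month, sum(number) / sum(sum(number)) over () as ratio
from cars
where province = '山西省' and year = 2013 and month is not null
group by month;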
2. Distribution of monthly sales across different brands (counts)
select month, brand, sum(number)
from cars
group by brand, month
having brand is not null and brand <> ''  -- <> means not-equal, same as !=
and month is not null;
Result: one row per (month, brand) pair with the summed sales count.
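For a quick cross-check outside Hive, the same distribution can be computed in pandas from the cleaned CSV. This is only a sketch: it assumes the month, brand, and quantity columns in the CSV header are named 月, 品牌, and 数量, which may differ in your file.

import pandas as pd

df = pd.read_csv("D:\\fenbushi\\hadoop\\Cars2.csv")
# group by month and brand, then sum the per-record sales quantity
# (column names 月/品牌/数量 are assumptions; adjust to match the CSV header)
print(df.groupby(["月", "品牌"])["数量"].sum())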