Data sample: below is the processed data. Here I chose to fill in the missing values, though truncating/dropping the incomplete records would also work.
Many people have messaged me asking for the data file; since I started working I rarely use CSDN anymore, sorry~
Link: https://pan.baidu.com/s/1vSj7qu4xHCefbQELKsp1xA
Extraction code: jvbx
Python script:
import pandas as pd
import numpy as np

df = pd.read_csv("D:\\fenbushi\\hadoop\\Cars.csv", encoding='gbk')

# Columns missing from the raw file: create them with default values
df["厢长"] = 0
df["厢宽"] = 0
df["厢高"] = 0
df["轴数"] = 2
df["准牵引质量"] = 0

# For each column, replace NaN and empty strings with a sensible default
Fuel_type = df["燃料种类"].fillna("汽油")
Fuel_type[Fuel_type == ''] = "汽油"
df["燃料种类"] = Fuel_type

car_width = df["车宽"].fillna(1620)
car_width[car_width == ''] = 1620
df["车宽"] = car_width

car_height = df["车高"].fillna(1915)
car_height[car_height == ''] = 1915
df["车高"] = car_height

wheelbase = df["轴距"].fillna(2700)
wheelbase[wheelbase == ''] = 2700
df["轴距"] = wheelbase

front_track = df["前轮距"].fillna(1390)
front_track[front_track == ''] = 1390
df["前轮距"] = front_track

Tire_specifications = df["轮胎规格"].fillna("175/70R14LT")
Tire_specifications[Tire_specifications == ''] = "175/70R14LT"
df["轮胎规格"] = Tire_specifications

luntaishu = df['轮胎数'].fillna(4)
luntaishu[luntaishu == ''] = 4
df['轮胎数'] = luntaishu
#print(df["轮胎数"])

Total_mass = df["总质量"].fillna(1850)
Total_mass[Total_mass == ''] = 1850
df["总质量"] = Total_mass

Curb_quality = df["整备质量"].fillna(1850)
Curb_quality[Curb_quality == ''] = 1850
df["整备质量"] = Curb_quality

#del df["厢长"]
#del df["厢宽"]
#del df["厢高"]
#del df["核定载质量"]

# Restore passenger-capacity ranges that were converted to dates (e.g. '4月9日' -> "4|9")
Approved_passenger = df["核定载客"].fillna("24-26")
Approved_passenger[Approved_passenger == '4月9日'] = "4|9"
Approved_passenger[Approved_passenger == '7月8日'] = "7|8"
Approved_passenger[Approved_passenger == '5月8日'] = "5|8"
Approved_passenger[Approved_passenger == '5月9日'] = "5|9"
Approved_passenger[Approved_passenger == '6月9日'] = "6|9"
Approved_passenger[Approved_passenger == '10月14日'] = "10|14"
Approved_passenger[Approved_passenger == '10月15日'] = "10|15"
Approved_passenger[Approved_passenger == '10月19日'] = "10|19"
Approved_passenger[Approved_passenger == '10月23日'] = "10|23"
df["核定载客"] = Approved_passenger

quality = df["核定载质量"].fillna(0)
quality[quality == 0] = 50
quality[quality == ''] = 50
df["核定载质量"] = quality

traction_quality = df["准牵引质量"].fillna(0)
traction_quality[traction_quality == ''] = 0
df["准牵引质量"] = traction_quality
#print(df["准牵引质量"])

chassis_Enterprise = df["底盘企业"].fillna(0)
chassis_Enterprise[chassis_Enterprise == 0] = "江铃汽车股份有限公司"
chassis_Enterprise[chassis_Enterprise == ''] = "江铃汽车股份有限公司"
df["底盘企业"] = chassis_Enterprise
#print(df["底盘企业"])

chassis_brand = df["底盘品牌"].fillna(0)
chassis_brand[chassis_brand == 0] = "航天"
chassis_brand[chassis_brand == ''] = "航天"
df["底盘品牌"] = chassis_brand
#print(df["底盘品牌"])

chassis_model = df["底盘型号"].fillna(0)
chassis_model[chassis_model == 0] = "GHT6402E"
chassis_model[chassis_model == ''] = "GHT6402E"
chassis_model[chassis_model == '-'] = "GHT6402E"
df["底盘型号"] = chassis_model
#print(df["底盘型号"])

age = df["年龄"].fillna(1921)
age[age == ''] = 1921
df["年龄"] = age
#print(df["年龄"])

sex = df["性别"].fillna(0)
sex[sex == 0] = "男性"
sex[sex == ''] = "男性"
df["性别"] = sex
print(df["性别"])

# index=False keeps the column positions aligned for the MapReduce/Hive steps below
df.to_csv("D:\\fenbushi\\hadoop\\Cars2.csv", index=False)
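A quick sanity check after running the script, to confirm no missing values slipped through (a sketch; the column list simply mirrors the fields filled above):

# verify that no NaN or empty-string values remain in the filled columns
cols = ["燃料种类", "车宽", "车高", "轴距", "前轮距", "轮胎规格", "轮胎数",
        "总质量", "整备质量", "核定载客", "核定载质量", "准牵引质量",
        "底盘企业", "底盘品牌", "底盘型号", "年龄", "性别"]
print(df[cols].isnull().sum())   # expect 0 for every column
print((df[cols] == '').sum())    # expect 0 for every column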
This post covers only the following two requirements:
1. The proportion of car sales in each month of 2013 for Shanxi Province.
2. The distribution of monthly sales across the different brands.
Each is implemented two ways below: with MapReduce, and with HiveQL.
I. MapReduce implementation
1. Proportion of car sales in each month of 2013 in Shanxi Province
package com.cxy.car;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class Car_sale_Bili {

    public static class BiliMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // note: StringUtils.split collapses consecutive delimiters, so this
            // assumes no empty fields (the data was pre-filled by the script above)
            String[] lines = StringUtils.split(line, ",");
            // need at least 5 fields: province is column 0, month column 1, year column 4
            if (null != lines && lines.length > 4 && !lines[0].isEmpty()) {
                if (lines[0].equals("山西省") && lines[4].equals("2013")) {
                    // emit (month, 1) for every matching sale record
                    context.write(new Text(lines[1]), new LongWritable(1));
                }
            }
        }
    }

    public static class BiliReduce extends Reducer<Text, LongWritable, Text, DoubleWritable> {
        // grand total and per-month counts, accumulated across all reduce() calls
        double all = 0;
        Map<String, Long> maps = new HashMap<String, Long>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            all += count;
            maps.put(key.toString(), count);
        }

        // runs once after the last reduce() call, when the grand total is complete
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            Set<String> keySet = maps.keySet();
            for (String str : keySet) {
                long value = maps.get(str);
                double bili = value / all;
                context.write(new Text(str), new DoubleWritable(bili));
            }
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Car_sale_Bili.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        job.setMapperClass(BiliMapper.class);
        job.setReducerClass(BiliReduce.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean flag = job.waitForCompletion(true);
        System.exit(flag ? 0 : 1);
    }
}
A note on the cleanup() method used in the reducer: cleanup() runs exactly once, after the final reduce() call, so by that point the grand total all is complete and the per-month ratios can safely be emitted (this pattern is only correct when the job runs with a single reducer, the default here). For more background, see:
大数据 cleanup用法
2. Distribution of monthly sales across different brands
package com.cxy.car;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Car_sale_fenbu {

    public static class PPMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] lines = StringUtils.split(line, ",");
            // month is column 1, brand is column 7; emit ("month\tbrand", 1)
            if (null != lines && lines.length > 7 && lines[1] != null && lines[7] != null) {
                context.write(new Text(lines[1] + "\t" + lines[7]), new IntWritable(1));
            }
        }
    }

    public static class PPReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Car_sale_fenbu.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(PPMapper.class);
        job.setReducerClass(PPReduce.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
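One optional improvement, not in the original code: since PPReduce just sums integer counts (an associative and commutative operation), the same class can be registered as a combiner to pre-aggregate on the map side and shrink the shuffle. It takes one extra line in main():

job.setCombinerClass(PPReduce.class);

(This would not work for the first job, whose reducer keeps state and changes output types.)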
To run the jobs, package the classes into a jar, copy the jar onto your CentOS/Ubuntu machine,
and then execute the following Hadoop command:
hadoop jar <jar path> <fully qualified name of the class containing main> <input path> <output path>
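For example, to run the first job (the jar name and HDFS paths below are hypothetical; substitute your own):

hadoop jar /home/hadoop/car.jar com.cxy.car.Car_sale_Bili /cars /out/bili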
II. Hive implementation
This assumes your cluster already has Hadoop and Hive installed; for Hive installation, see:
Centos下hive安装 (the same steps apply on Ubuntu)
First, create a table:
create table cars(
  province string,   -- province
  month int,         -- month
  city string,       -- city
  county string,     -- county
  year int,          -- year
  cartype string,    -- vehicle model
  productor string,  -- manufacturer
  brand string,      -- brand
  mold string,       -- vehicle type
  owner string,      -- ownership
  nature string,     -- usage nature
  number int,        -- quantity
  ftype string,      -- engine model
  outv int,          -- displacement
  power double,      -- power
  fuel string,       -- fuel type
  length int,        -- vehicle length
  width int,         -- vehicle width
  height int,        -- vehicle height
  xlength int,       -- cargo box length
  xwidth int,        -- cargo box width
  xheight int,       -- cargo box height
  count int,         -- number of axles
  base int,          -- wheelbase
  front int,         -- front track
  norm string,       -- tire specification
  tnumber int,       -- number of tires
  total int,         -- gross mass
  curb int,          -- curb weight
  hcurb int,         -- approved load mass
  passenger string,  -- approved passenger capacity
  zhcurb int,        -- approved towing mass
  business string,   -- chassis enterprise
  dtype string,      -- chassis brand
  fmold string,      -- chassis model
  fbusiness string,  -- engine enterprise
  name string,       -- vehicle name
  age int,           -- age
  sex string         -- sex
)
row format delimited fields terminated by ','  -- the field delimiter; my exported txt is comma-separated
location '/cars';
Then load the sample data (this assumes the data file already exists on the machine):
load data local inpath '/home/hadoop/Cars.txt' into table cars;
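A quick sanity check that the load succeeded (a sketch; the exact row count depends on your copy of the data):

select count(*) from cars;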
1. Proportion of car sales in each month of 2013 in Shanxi Province
select a.year, a.month, a.counts/b.sums
from
(select year, month, sum(number) counts
 from cars
 where province = '山西省' and year = 2013
 group by year, month
 having month is not null) a,
(select year, sum(number) sums
 from cars
 where province = '山西省' and year = 2013
 group by year) b;
Result: one row per month of 2013 with that month's share of the annual total.
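On newer Hive versions (0.11+), the same ratio can be computed without the self-join by applying a window function over the grouped totals; a sketch under the same filters:

select month, sum(number) / sum(sum(number)) over () as ratio
from cars
where province = '山西省' and year = 2013 and month is not null
group by month;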
2. Distribution of monthly sales across different brands (counts)
select month, brand, sum(number)
from cars
group by brand, month
having brand is not null and brand <> ''  -- <> means not-equal, same as !=
and month is not null;
Result: one row per (month, brand) pair with the summed sales count.
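For a quick cross-check outside Hive, the same distribution can be computed in pandas from the cleaned CSV. This is only a sketch: it assumes the month, brand, and quantity columns in the CSV header are named 月, 品牌, and 数量, which may differ in your file.

import pandas as pd

df = pd.read_csv("D:\\fenbushi\\hadoop\\Cars2.csv")
# group by month and brand, then sum the per-record sales quantity
# (column names 月/品牌/数量 are assumptions; adjust to match the CSV header)
print(df.groupby(["月", "品牌"])["数量"].sum())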