A small script that uploads a CSV file to Hive and automatically creates the matching table.
# coding=utf8 """ requirements: tableschema sqlalchemy pandas pyarrow pyhive """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals from tableschema import Table from sqlalchemy import create_engine import pandas as pd import logging import numpy as np import time import os logging_format = '%(asctime)s pid:%(process)d %(levelname)s %(module)s - %(message)s' logging.basicConfig(level=logging.INFO, format=logging_format, filemode='a') SCHEMA_HIVE_TYPES = { "integer": "BIGINT", "number": "DOUBLE", "string": "STRING", "int": "BIGINT" } class HiveUploader(object): """ 上传csv, 在hive建表 """ def __init__(self, source_file, table_name, location, sqlalchemy_uri, if_exits="fail"): """ hive uploader init params :param source_file: file path for csv to upload :param table_name: table_name to be created :param location: oss or s3 or hdfs :param sqlalchemy_uri: hive connection uri, e.g. "hive://{username}:{password}@{host}:{port}/default" :param if_exits: replace, append, or fail, default is "fail" """ self._source_file = source_file self._target_file = os.path.join("/tmp", table_name) self._table_name = table_name self._location = location self._engine = create_engine(sqlalchemy_uri, connect_args={"auth": "LDAP"}) self._if_exits = if_exits self._db_name = table_name.split(".")[0] self._tbl_name = table_name.split(".")[1] def _drop_table(self): statement = """drop table if exists %s""" % self._table_name self._engine.execute(statement) def _check_if_exits(self): """ check if the table_name exists. :return: """ sql = """show tables from {} like '{}'""".format(self._db_name, self._tbl_name) df = pd.read_sql_query(sql, self._engine) exist = not df.empty logging.info("table %s exists status: %s" % (table_name, exist)) return not df.empty def _drop_table(self): """ drop table before create :return: """ sql = """drop table if exists {}""".format(self._table_name) logging.info(sql) self._engine.execute(sql) def replace(self): # 1: drop table self._drop_table() self.upload() def repair_table(self): sql = "msck repair table {}".format(self._table_name) self._engine.execute(sql) def append(self): target_file, create_sql = self.prepare_upload() HiveUploader.upload_to_oss(target_file, self._location + str(int(time.time()))) self.repair_table() def prepare_upload(self): # parse columns df = pd.read_csv(source_file) table_schema = Table(source_file).infer() fields = table_schema.get("fields") schema = {} columns = [] for field in fields: column_name = field['name'] col_type = field["type"] if column_name not in df.columns: continue if col_type in SCHEMA_HIVE_TYPES: column_type = SCHEMA_HIVE_TYPES[col_type] else: column_type = "STRING" schema[column_name] = column_type columns.append(column_name) df.columns = columns for column_name in columns: column_type = schema[column_name] try: if column_type == "BIGINT": df[column_name].fillna(0, inplace=True) df[column_name] = df[column_name].astype(np.int64) elif column_type == "DOUBLE": df[column_name].fillna(0, inplace=True) df[column_name] = df[column_name].astype(np.float64) else: df[column_name].fillna('', inplace=True) df[column_name] = df[column_name].astype(str) except: df[column_name] = df[column_name].astype(str) schema[column_name] = "STRING" df.tail().to_parquet(self._target_file) column_defs = ["`{}` {}".format(c, schema[c]) for c in columns] column_definition = ",\n ".join(column_defs) location, file_name = os.path.split(self._location) create_sql = """ create table {} 
( {} ) STORED AS parquet location '{}' """.format(self._table_name, column_definition, location) return self._target_file, create_sql @staticmethod def upload_to_oss(target_file, oss_path): rm = "oss rm " + oss_path os.system(rm) cp = "oss cp {} {}".format(target_file, oss_path) os.system(cp) def upload(self): """upload to oss and create table""" logging.info("start to upload file to oss!") target_file, create_sql = self.prepare_upload() # upload file to oss HiveUploader.upload_to_oss(target_file, self._location) # create table self._engine.execute(create_sql) def run(self): # 1: check if table exists exists = self._check_if_exits() if exists: if self._if_exits == 'fail': msg = "table_name {} is already exists!".format(self._table_name) logging.error(msg) raise Exception(msg) elif self._if_exits == "append": self.append() elif self._if_exits == "replace": self.replace() else: self.upload() sql = """select * from {} limit 100""".format(self._table_name) df = pd.read_sql_query(sql, self._engine) print(df) if __name__ == '__main__': hive_uri = "hive://{username}:{password}@1{ip}:{port}/default" source_file = "~/Desktop/magic.csv" table_name = "test.test_upload_here_test_v1" location = "oss://{bucket_name}//{}/{}.parquet".format(table_name, table_name) hive_uploader = HiveUploader(source_file, table_name, location, hive_uri, if_exits="fail") hive_uploader.run()
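For reference, this is roughly the DDL that prepare_upload generates. The table name, columns and bucket here are illustrative (not from the original post): a CSV with an integer column id, a string column name and a numeric column score, uploaded as test.demo with location "oss://{bucket_name}/test.demo/test.demo.parquet", would produce approximately:

    create table test.demo
    (
        `id` BIGINT,
        `name` STRING,
        `score` DOUBLE
    )
    STORED AS parquet
    location 'oss://{bucket_name}/test.demo'

Because the statement points at the directory containing the parquet file (os.path.split drops the file name), append mode only has to copy additional parquet files into that directory and run msck repair table.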