1. Download the Docker image repository https://github.com/big-data-europe/docker-hive.git and install it.
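A minimal sketch of the download step, assuming git and docker-compose are already installed on the host:

# fetch the compose project and enter its directory
git clone https://github.com/big-data-europe/docker-hive.git
cd docker-hive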
2. Modify its docker-compose.yml file to add a volume mapping to each container.
version: "3"

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop2.7.4-java8
    volumes:
      - /data/namenode:/hadoop/dfs/name
      - /data/tools:/tools
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop-hive.env
    ports:
      - "50070:50070"
  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop2.7.4-java8
    volumes:
      - /data/datanode:/hadoop/dfs/data
      - /data/tools:/tools
    env_file:
      - ./hadoop-hive.env
    environment:
      SERVICE_PRECONDITION: "namenode:50070"
    ports:
      - "50075:50075"
  hive-server:
    image: bde2020/hive:2.3.2-postgresql-metastore
    volumes:
      - /data/tools:/tools
    env_file:
      - ./hadoop-hive.env
    environment:
      HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: "jdbc:postgresql://hive-metastore/metastore"
      SERVICE_PRECONDITION: "hive-metastore:9083"
    ports:
      - "10000:10000"
  hive-metastore:
    image: bde2020/hive:2.3.2-postgresql-metastore
    volumes:
      - /data/tools:/tools
    env_file:
      - ./hadoop-hive.env
    command: /opt/hive/bin/hive --service metastore
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode:50075 hive-metastore-postgresql:5432"
    ports:
      - "9083:9083"
  hive-metastore-postgresql:
    image: bde2020/hive-metastore-postgresql:2.3.0
    volumes:
      - /data/tools:/tools
  presto-coordinator:
    image: shawnzhu/prestodb:0.181
    volumes:
      - /data/tools:/tools
    ports:
      - "8080:8080"
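The volumes above point at /data/namenode, /data/datanode and /data/tools on the host; a sketch for creating them beforehand (assuming root or sudo rights):

# host directories backing the container volume mappings
sudo mkdir -p /data/namenode /data/datanode /data/tools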
3. Create the test text
1,xiaoming,book-TV-code,beijing:chaoyang-shagnhai:pudong
2,lilei,book-code,nanjing:jiangning-taiwan:taibei
3,lihua,music-book,heilongjiang:haerbin
3,lihua,music-book,heilongjiang2:haerbin2
3,lihua,music-book,heilongjiang3:haerbin3
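A sketch for saving this text on the host as /data/tools/example.txt, which the containers see as /tools/example.txt through the mapping above (the file name is taken from the load commands used later):

# write the sample rows into the shared /data/tools volume
cat > /data/tools/example.txt <<'EOF'
1,xiaoming,book-TV-code,beijing:chaoyang-shagnhai:pudong
2,lilei,book-code,nanjing:jiangning-taiwan:taibei
3,lihua,music-book,heilongjiang:haerbin
3,lihua,music-book,heilongjiang2:haerbin2
3,lihua,music-book,heilongjiang3:haerbin3
EOF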
4. Start the containers and connect to the Hive service.
docker-compose up -d
docker-compose exec hive-server bash
/opt/hive/bin/beeline -u jdbc:hive2://localhost:10000
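Before connecting, it can help to confirm that all of the services came up; a sketch, where the show databases query is only there to prove the JDBC connection works:

# list the compose services and their state
docker-compose ps
# quick connectivity check against HiveServer2
docker-compose exec hive-server /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -e "show databases;"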
5. Create an external table
create external table t2(
  id int,
  name string,
  hobby array<string>,
  add map<string,string>
)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
location '/user/t2';
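To double-check the table definition and its HDFS location, a sketch using beeline's -e one-liner form (running it through docker-compose exec is an assumption; the same statement also works inside the beeline session from step 4):

# print columns, storage format and location of t2
docker-compose exec hive-server /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -e "describe formatted t2;"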
6. Upload the file into the table directory created in the previous step.
Method 1: in the Hive beeline terminal, run:
load data local inpath '/tools/example.txt' overwrite into table t2;   (removes every file already in the directory, then writes the new one)
load data local inpath '/tools/example.txt' into table t2;   (adds a new file to the directory; the only difference is OVERWRITE)
Method 2: upload the file with hadoop fs -put.
hadoop fs -put /tools/example.txt /user/t2   (keeps the original file name)
hadoop fs -put /tools/example.txt /user/t2/1.txt   (stores the file as 1.txt; see the listing sketch below)
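Either way, listing the table directory shows which files actually landed there; a sketch run in the hive-server container, the same place the -put commands above are assumed to run:

# list the files backing table t2
docker-compose exec hive-server hadoop fs -ls /user/t2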
7. Verify from the Hive command line
select * from t2;   (run it once after each upload)
8. The newly uploaded files can also be browsed in Hadoop's file manager (the NameNode web UI).
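The NameNode web UI is published on host port 50070 in the compose file above; assuming WebHDFS is enabled (the default in these Hadoop 2.7 images), the same listing can also be pulled over HTTP:

# WebHDFS listing of the table directory via the NameNode HTTP port
curl "http://localhost:50070/webhdfs/v1/user/t2?op=LISTSTATUS"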
Records duplicated within the same file are deduplicated automatically.
——————————————-
What if the file is a SequenceFile?
1. Inspect the contents of the sequencefile.
hadoop fs -Dfs.default.name=file:/// -text /tools/mytest.gzip.sf   (fs.default.name is deprecated)
hadoop fs -Dfs.defaultFS=file:/// -text /tools/mytest.txt.sf
These commands print the file's actual contents.
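The same check can be run from the host through docker-compose exec; a sketch using the namenode container, though any container that mounts /data/tools as /tools should work:

# dump the sequencefile as text, reading from the local filesystem instead of HDFS
docker-compose exec namenode hadoop fs -Dfs.defaultFS=file:/// -text /tools/mytest.txt.sf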
2. Create the table
create external table sfgz(
  `idx` string,
  `userid` string,
  `flag` string,
  `count` string,
  `value` string,
  `memo` string)
partitioned by (dt string)
row format delimited
fields terminated by ','
stored as sequencefile
location '/user/sfgz';
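A quick way to confirm that the partitioning, sequencefile storage and location took effect; the docker-compose exec wrapper is an assumption, the statement itself is standard HiveQL:

# show the full DDL Hive stored for sfgz
docker-compose exec hive-server /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -e "show create table sfgz;"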
3. Upload the files
Method 1:
hadoop fs -mkdir -p /user/sfgz/dt=2010-05-06/
hadoop fs -put /tools/mytest.txt.sf /user/sfgz/dt=2019-05-17
hadoop fs -put /tools/mytest.txt.sf /user/sfgz/dt=2010-05-04
With this method the new partitions still have to be reloaded manually before Hive sees them; a reload sketch follows below.
Method 2:
load data local inpath '/tools/mytest.txt.sf' into table sfgz partition(dt='2009-03-01');   (with this method the data can be queried directly)
load data local inpath '/tools/mytest.gzip.sf' into table sfgz partition(dt='2000-03-02');
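A minimal reload sketch for Method 1, assuming MSCK REPAIR TABLE (or an explicit ALTER TABLE ... ADD PARTITION) is the intended way to register the hand-created dt=... directories with the metastore; show partitions then confirms what was picked up:

# register partition directories that were written directly to HDFS
docker-compose exec hive-server /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -e "msck repair table sfgz;"
# verify which partitions the metastore now knows about
docker-compose exec hive-server /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -e "show partitions sfgz;"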