微博数据挖掘脚本流程

一、加载数据到源表

```shell

#!/bin/bash
# Step 1: upload the local Weibo text dump to HDFS and load it into the
# Hive source table ($db_name.$output_table).
set -euo pipefail

txt_file_path_local=../data/text_data/weiboplus.txt
txt_dir_path_hdfs=/tmp/ws/data/

db_name=wangshuai
output_table=weiboplus_origin

# Basename of the local file; parameter expansion replaces the original
# broken `awk -F / {print $NF}` (the awk program was unquoted, so the
# shell would have mangled the braces and expanded $NF itself).
file_path=${txt_file_path_local##*/}

hdfs dfs -put -f "$txt_file_path_local" "$txt_dir_path_hdfs"

# Hive requires the inpath to be a quoted string literal; also use
# $output_table instead of hardcoding the table name a second time.
hive -e "
use $db_name;
load data inpath '${txt_dir_path_hdfs}${file_path}' overwrite into table $output_table;
"

```

二、处理json插入product表

```shell

#!/bin/bash
# Step 2: extract the `content` field from the raw JSON column of the
# origin table into the product table.
set -euo pipefail

db_name=wangshuai
input_table=weiboplus_origin
output_table=weiboplus_product

# The JSON path must reach Hive as the literal string '$[0].content'.
# Inside the double-quoted -e string, a bare $[0] is bash's deprecated
# arithmetic expansion (it would expand to `0`), so the dollar sign is
# escaped and the path is single-quoted for Hive.
# NOTE(review): path '$[0].content' is kept from the original — confirm
# the stored JSON really is an array at the top level.
hive -e "
use $db_name;
insert overwrite table $output_table
select get_json_object(json, '\$[0].content') as content
from $input_table;
"

```

三、生成分词结果表

```shell

#!/bin/bash
# Step 3: register the word-segmentation UDF from a jar on HDFS and
# write segmented content into the segmentation result table.
set -euo pipefail

db_name=wangshuai
input_table=weiboplus_product
output_table=weiboplus_seg_result
jar_path_hdfs=hdfs:///tmp/ws/data/day01-1.0-SNAPSHOT-jar-with-dependencies.jar
class_path=com.atsansan.day24.deal.NlpUDF
# Comma-separated POS tags (noun subtypes) the UDF should keep.
natureStr_list=n,nr,nr1,nr2,nrj,nrf,ns,nsf,nt,nz,nl,ng,nw

# Hive syntax requires the UDF class name and the string argument to the
# UDF to be quoted string literals; the original passed both bare.
hive -e "
use $db_name;
add jar $jar_path_hdfs;
create temporary function seg as '$class_path';
insert into table $output_table
select seg(content, '$natureStr_list') from $input_table;
"

```

四、生成倒排表

```shell

#!/bin/bash
# Step 4: explode segmented content into single words, count frequencies,
# drop stopwords via a left join against the blacklist table, and write
# the word-count table ordered by frequency.
set -euo pipefail

db_name=wangshuai
input_table=weiboplus_seg_result
output_table=weiboplus_seg_wc
# Stopword blacklist (fixed the `intput_` variable typo; the table name
# `weibopuls_stopwords` is kept as-is — it must match the real table).
input_table2=weibopuls_stopwords

# split() needs a quoted delimiter; the original bare `01` was presumably
# the \001 separator emitted by the seg UDF — TODO confirm. Bash collapses
# '\\\\001' to '\\001', which is the standard Hive regex for char \001.
# The table names now come from the variables instead of being hardcoded.
hive -e "
use $db_name;
insert overwrite table $output_table
select word_cnt.word, freq
from (
  select word, count(1) freq
  from $input_table
  lateral view explode(split(content, '\\\\001')) word_table as word
  where content is not null and length(word) > 1
  group by word
) as word_cnt
left join $input_table2 black
  on word_cnt.word = black.word
where black.word is null
order by freq desc;
"

```

五、下载数据到本地

```shell

#!/bin/bash
# Step 5: export the top-500 words (by frequency, dense_rank so ties all
# survive) to an HDFS directory, then pull it to the local data dir.
set -euo pipefail

db_name=wangshuai
down_path_hdfs=/tmp/ws/data/hotwordsplus/
down_path_local=../data/
input_table=weiboplus_seg_wc

# The directory path must be a quoted Hive string literal. The original
# `fields terminated by` was missing its delimiter entirely — '\t' is
# assumed here (tab-separated output); confirm downstream consumers.
hive -e "
use $db_name;
insert overwrite directory '$down_path_hdfs'
row format delimited fields terminated by '\t'
select word, freq
from (
  select word, freq, dense_rank() over (order by freq desc) top
  from $input_table
) word_top
where top <= 500;
"

hdfs dfs -get -f "$down_path_hdfs" "$down_path_local"

```

经验分享 程序员 微信小程序 职场和发展