Weibo Data Mining Script Workflow
1. Load the data into the source table
```shell
#!/bin/bash
# Upload the raw Weibo text file to HDFS and load it into the source table.
txt_file_path_local=../data/text_data/weiboplus.txt
txt_dir_path_hdfs=/tmp/ws/data/
db_name=wangshuai
output_table=weiboplus_origin
# Take the bare file name from the local path.
file_path=`echo $txt_file_path_local | awk -F / '{print $NF}'`

hdfs dfs -put -f $txt_file_path_local $txt_dir_path_hdfs
hive -e "
use $db_name;
load data inpath '$txt_dir_path_hdfs$file_path' overwrite into table $output_table;
"
```
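The load step assumes `weiboplus_origin` already exists. As a minimal setup sketch (the single `json` string column is an assumption inferred from the `get_json_object(json, ...)` call in step 2), the table could be created once like this:

```shell
#!/bin/bash
# Hypothetical one-time setup for the source table loaded in step 1.
# Column name "json" is assumed from the get_json_object(json, ...) call in step 2.
db_name=wangshuai
hive -e "
use $db_name;
create table if not exists weiboplus_origin (json string);
"
```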
2. Parse the JSON and insert into the product table
```shell
#!/bin/bash
# Extract the content field from the raw JSON and write it into the product table.
db_name=wangshuai
input_table=weiboplus_origin
output_table=weiboplus_product

# \$ below keeps the JSON path literal inside the double-quoted hive -e string.
hive -e "
use $db_name;
insert overwrite table $output_table
select get_json_object(json, '\$[0].content') as content from $input_table;
"
```
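A quick spot check of the extracted field (a sketch; the row limit is arbitrary) can confirm the JSON path is right before running the segmentation step:

```shell
#!/bin/bash
# Sanity check: look at a few extracted content rows before segmenting.
db_name=wangshuai
hive -e "
use $db_name;
select content from weiboplus_product limit 10;
"
```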
3. Generate the word-segmentation result table
```shell
#!/bin/bash
# Segment content with the custom NLP UDF, keeping only the listed noun-type natures.
db_name=wangshuai
input_table=weiboplus_product
output_table=weiboplus_seg_result
jar_path_hdfs=hdfs:///tmp/ws/data/day01-1.0-SNAPSHOT-jar-with-dependencies.jar
class_path=com.atsansan.day24.deal.NlpUDF
natureStr_list=n,nr,nr1,nr2,nrj,nrf,ns,nsf,nt,nz,nl,ng,nw

hive -e "
use $db_name;
add jar $jar_path_hdfs;
create temporary function seg as '$class_path';
insert into table $output_table
select seg(content, '$natureStr_list') from $input_table;
"
```
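Before segmenting the whole table, the UDF can be smoke-tested on a literal string. This is only a sketch: the sample sentence is made up, and it assumes `seg` accepts any string plus the comma-separated nature list, exactly as it is called in the script above.

```shell
#!/bin/bash
# Hypothetical smoke test for the seg UDF on a literal sentence.
db_name=wangshuai
jar_path_hdfs=hdfs:///tmp/ws/data/day01-1.0-SNAPSHOT-jar-with-dependencies.jar
class_path=com.atsansan.day24.deal.NlpUDF
hive -e "
use $db_name;
add jar $jar_path_hdfs;
create temporary function seg as '$class_path';
select seg('今天在北京看了一场微博热搜上的演出', 'n,ns,nz');
"
```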
4. Generate the inverted word-frequency table
```shell
#!/bin/bash
# Explode the segmented words, count their frequencies, and drop stopwords.
db_name=wangshuai
input_table=weiboplus_seg_result
input_table2=weibopuls_stopwords
output_table=weiboplus_seg_wc

# '\\\\001' reaches Hive as '\\001', i.e. the \001 separator the seg UDF is
# assumed to place between words (the original script had a bare "01" here).
hive -e "
use $db_name;
insert overwrite table $output_table
select word_cnt.word, freq
from (
  select word, count(1) freq
  from $input_table lateral view explode(split(content, '\\\\001')) word_table as word
  where content is not null and length(word) > 1
  group by word
) as word_cnt
left join $input_table2 black on word_cnt.word = black.word
where black.word is null
order by freq desc;
"
```
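The join above assumes the stopword table `weibopuls_stopwords` (spelled as in the script) already exists with a single `word` column. A minimal setup sketch, assuming a local stopword file whose path is only illustrative:

```shell
#!/bin/bash
# Hypothetical setup for the stopword blacklist used in step 4.
# The local file path is an assumption; point it at the real stopword list.
db_name=wangshuai
stopwords_local=../data/text_data/stopwords.txt
hive -e "
use $db_name;
create table if not exists weibopuls_stopwords (word string);
load data local inpath '$stopwords_local' overwrite into table weibopuls_stopwords;
"
```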
5. Download the data to the local machine
```shell
#!/bin/bash
# Export the top 500 hot words to an HDFS directory, then pull them to the local data dir.
db_name=wangshuai
down_path_hdfs=/tmp/ws/data/hotwordsplus/
down_path_local=../data/
input_table=weiboplus_seg_wc

# The '\t' field delimiter is an assumption; the original script left it unspecified.
hive -e "
use $db_name;
insert overwrite directory '$down_path_hdfs'
row format delimited fields terminated by '\t'
select word, freq
from (
  select word, freq, dense_rank() over (order by freq desc) top
  from $input_table
) word_top
where top <= 500;
"

hdfs dfs -get -f $down_path_hdfs $down_path_local
```