方案实现
库表设计
在mysql上创建采集表
CREATE TABLE `music_data` (
`mid` int(11) NOT NULL AUTO_INCREMENT,
`collector` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`coll_time` datetime DEFAULT NULL,
`song_url` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`song_name` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`artist_url` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`artist_name` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`album_url` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`album_name` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
PRIMARY KEY (`mid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
`mid` int(11) NOT NULL AUTO_INCREMENT,
`collector` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`coll_time` datetime DEFAULT NULL,
`song_url` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`song_name` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`artist_url` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`artist_name` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`album_url` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`album_name` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
PRIMARY KEY (`mid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
在mysql上创建收集表
CREATE TABLE `test`.`sum_artist_name` (
`mid` int(11) NOT NULL AUTO_INCREMENT,
`collector` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`coll_time` datetime DEFAULT NULL,
`artist_name` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`acount` int(11) DEFAULT 0,
PRIMARY KEY (`mid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
CREATE TABLE `test`.`wc_song_name` (
`mid` int(11) NOT NULL AUTO_INCREMENT,
`collector` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`coll_time` datetime DEFAULT NULL,
`word` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`acount` int(11) DEFAULT 0,
PRIMARY KEY (`mid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
`mid` int(11) NOT NULL AUTO_INCREMENT,
`collector` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`coll_time` datetime DEFAULT NULL,
`artist_name` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`acount` int(11) DEFAULT 0,
PRIMARY KEY (`mid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
CREATE TABLE `test`.`wc_song_name` (
`mid` int(11) NOT NULL AUTO_INCREMENT,
`collector` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`coll_time` datetime DEFAULT NULL,
`word` varchar(45) COLLATE utf8_unicode_ci DEFAULT NULL,
`acount` int(11) DEFAULT 0,
PRIMARY KEY (`mid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
数据存储
mysql采集到hive
sqoop import --connect "jdbc:mysql://home.hddly.cn:53306/test?useSSL=false&useUnicode=true&characterEncoding=utf-8" --username test --password test --table music_data --where "collector='张三'" --delete-target-dir --target-dir /user/myname/sqoop_hive_music --hive-database myname --hive-import --hive-overwrite --hive-table music_data --hive-drop-import-delims --m 1
数据分析
hive上词频统计
use myname;
drop table wc_song_name;
create table wc_song_name as (select 0 as mid,'张三' as collecter,current_timestamp() as coll_time,word, count(1) as acount from (select explode(split(song_name, ' ')) as word from music_data) w group by word order by word);
drop table sum_artist_name;
create table sum_artist_name as (select 0 as mid,'张三' as collecter,current_timestamp() as coll_time,artist_name,count(1) as acount from music_data group by artist_name order by count(1) desc);
drop table wc_song_name;
create table wc_song_name as (select 0 as mid,'张三' as collecter,current_timestamp() as coll_time,word, count(1) as acount from (select explode(split(song_name, ' ')) as word from music_data) w group by word order by word);
drop table sum_artist_name;
create table sum_artist_name as (select 0 as mid,'张三' as collecter,current_timestamp() as coll_time,artist_name,count(1) as acount from music_data group by artist_name order by count(1) desc);
数据存储
hive转存到mysql
sqoop export --connect "jdbc:mysql://home.hddly.cn:53306/test?useSSL=false&useUnicode=true&characterEncoding=utf-8" --username test --password test --table sum_artist_name --fields-terminated-by '\001' --lines-terminated-by '\n' --export-dir /user/hive/warehouse/myname.db/sum_artist_name;
sqoop export --connect "jdbc:mysql://home.hddly.cn:53306/test?useSSL=false&useUnicode=true&characterEncoding=utf-8" --username test --password test --table wc_song_name --fields-terminated-by '\001' --lines-terminated-by '\n' --export-dir /user/hive/warehouse/myname.db/wc_song_name;
sqoop export --connect "jdbc:mysql://home.hddly.cn:53306/test?useSSL=false&useUnicode=true&characterEncoding=utf-8" --username test --password test --table wc_song_name --fields-terminated-by '\001' --lines-terminated-by '\n' --export-dir /user/hive/warehouse/myname.db/wc_song_name;