Internet Log Analysis


Starting the Single-Node Cluster Environment

1. Edit the cloud host's hosts file and add the internal IP mapped to the hostname hadoop000, so that the host itself can SSH into hadoop000 as root without a password.
Environment:
Hadoop/Hive/Spark

echo "172.18.0.248 hadoop000" >> /etc/hosts

ssh-keygen -t rsa

ssh-copy-id -i /root/.ssh/id_rsa.pub root@hadoop000
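
To confirm that passwordless login works, run a quick test; it should print the hostname without prompting for a password:

ssh root@hadoop000 hostname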

2. Format the HDFS file system.
Environment:
Hadoop/Hive/Spark

hdfs namenode -format

3. Start the Hadoop cluster.
Environment:
Hadoop/Hive/Spark

start-all.sh
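
To verify that the daemons are running, jps should list processes such as NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager:

jps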

Internet Log Analysis

1. Upload the local file /root/internetlogs/journal.log to the /input/ directory on HDFS; create the directory yourself first.
Environment: Hadoop/Hive/Spark

hdfs dfs -mkdir /input
hdfs dfs -put /root/internetlogs/journal.log /input/
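
To confirm the upload:

hdfs dfs -ls /input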

2. Write a program to count page views; save the result to the part-00000 file under the local directory /root/internetlogs/pv/.
Environment: Hadoop/Hive/Spark

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class PageViewCount extends Configured implements Tool {
    // Mapper: emit (page, 1) for every log line; the page is assumed to be
    // the 7th space-separated field of journal.log.
    public static class PVMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(" ");
            if (fields.length < 7) {
                return; // skip malformed lines
            }
            String page = fields[6];
            context.write(new Text(page), new IntWritable(1));
        }
    }

    // Reducer: sum the 1s per page to get its total number of views.
    public static class PVReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJarByClass(PageViewCount.class);
        job.setMapperClass(PVMapper.class);
        job.setReducerClass(PVReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input and output paths are hard-coded; the output directory must not exist yet.
        FileInputFormat.addInputPath(job, new Path("/input"));
        FileOutputFormat.setOutputPath(job, new Path("/output/pv"));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new PageViewCount(), args);
        System.exit(exitCode);
    }
}
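
Compile and package the program before running it. A minimal sketch, assuming the source is saved as PageViewCount.java in the current directory with no package declaration (the programs in the later steps can be packaged the same way):

mkdir -p classes
javac -classpath "$(hadoop classpath)" -d classes PageViewCount.java
jar -cvfe PageViewCount.jar PageViewCount -C classes .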

Run the MapReduce job, then download the result files from HDFS to the local directory:

hadoop jar PageViewCount.jar /input /output/pv

hdfs dfs -get /output/pv /root/internetlogs/pv
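
Note: with this MapReduce API the reducer output file is typically named part-r-00000. If a file named part-00000 is required, rename it after downloading (assuming a single reduce task); the same applies to the later steps:

mv /root/internetlogs/pv/part-r-00000 /root/internetlogs/pv/part-00000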

3. Write a program to count the number of visits per unique IP; save the result to the part-00000 file under the local directory /root/internetlogs/ip/. For example, 1.80.249.223 1 means this IP made 1 visit.
Environment: Hadoop/Hive/Spark

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class UniqueIPCount extends Configured implements Tool {
    // Mapper: emit (ip, 1) for every log line so the reducer can count the visits
    // made by each distinct IP, matching the example output "1.80.249.223 1".
    // The IP is assumed to be the first space-separated field of journal.log.
    public static class IPMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(" ");
            if (fields.length < 1 || fields[0].isEmpty()) {
                return; // skip malformed lines
            }
            context.write(new Text(fields[0]), new IntWritable(1));
        }
    }

    // Reducer: sum the 1s per IP to get that IP's total number of visits.
    public static class IPReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJarByClass(UniqueIPCount.class);
        job.setMapperClass(IPMapper.class);
        job.setReducerClass(IPReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/input"));
        FileOutputFormat.setOutputPath(job, new Path("/output/ip"));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new UniqueIPCount(), args);
        System.exit(exitCode);
    }
}

Run the MapReduce job, then download the result files from HDFS to the local directory:

hadoop jar UniqueIPCount.jar /input /output/ip

hdfs dfs -get /output/ip /root/internetlogs/ip

4. Write a program to count the number of site visits per hour; save the result to the part-00000 file under the local directory /root/internetlogs/time/.
Environment: Hadoop/Hive/Spark

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class HourlyVisitCount extends Configured implements Tool {
    // Mapper: emit (date:hour, 1) for every log line. The timestamp is assumed to
    // be the 4th space-separated field in common log format, e.g. "[27/Sep/2024:23:48:00",
    // so characters 1-11 are the date and characters 13-14 the hour.
    // Adjust the substring indices if journal.log uses a different timestamp format.
    public static class TimeMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(" ");
            if (fields.length < 4 || fields[3].length() < 15) {
                return; // skip malformed lines
            }
            String timestamp = fields[3];
            String date = timestamp.substring(1, 12);   // e.g. 27/Sep/2024
            String hour = timestamp.substring(13, 15);  // e.g. 23
            String hourlyKey = date + ":" + hour;
            context.write(new Text(hourlyKey), new IntWritable(1));
        }
    }

    // Reducer: sum the 1s per date:hour key to get the visits in that hour.
    public static class TimeReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJarByClass(HourlyVisitCount.class);
        job.setMapperClass(TimeMapper.class);
        job.setReducerClass(TimeReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/input"));
        FileOutputFormat.setOutputPath(job, new Path("/output/time"));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new HourlyVisitCount(), args);
        System.exit(exitCode);
    }
}

Run the MapReduce job, then download the result files from HDFS to the local directory:

hadoop jar HourlyVisitCount.jar /input /output/time

hdfs dfs -get /output/time /root/internetlogs/time

5. Write a program to count visits by browser identifier; save the result to the part-00000 file under the local directory /root/internetlogs/browser/ (see the step description for details).
Environment: Hadoop/Hive/Spark

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class BrowserCount extends Configured implements Tool {
    // Mapper: emit (browser, 1) for every log line. The browser identifier is
    // assumed to be the 8th space-separated field of journal.log; adjust the index
    // if the log stores the user agent elsewhere (e.g. as the last quoted field).
    public static class BrowserMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(" ");
            if (fields.length < 8) {
                return; // skip malformed lines
            }
            String browser = fields[7];
            context.write(new Text(browser), new IntWritable(1));
        }
    }

    // Reducer: sum the 1s per browser identifier to get its visit count.
    public static class BrowserReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJarByClass(BrowserCount.class);
        job.setMapperClass(BrowserMapper.class);
        job.setReducerClass(BrowserReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/input"));
        FileOutputFormat.setOutputPath(job, new Path("/output/browser"));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new BrowserCount(), args);
        System.exit(exitCode);
    }
}

Run the MapReduce job, then download the result files from HDFS to the local directory:

hadoop jar BrowserCount.jar /input /output/browser

hdfs dfs -get /output/browser /root/internetlogs/browser
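
To spot-check the downloaded results:

head /root/internetlogs/pv/part-* /root/internetlogs/ip/part-* /root/internetlogs/time/part-* /root/internetlogs/browser/part-*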

LazzMan · 2024-09-27 23:48