diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..981e66f --- /dev/null +++ b/.gitignore @@ -0,0 +1,36 @@ +ignite/ +activemq-data/ +.idea/ +target/ +.classpath +.project +.classpath +.settings/ +*.aux +*.glo +*.idx +*.log +*.toc +*.ist +*.acn +*.acr +*.alg +*.bbl +*.blg +*.dvi +*.glg +*.gls +*.ilg +*.ind +*.lof +*.lot +*.maf +*.mtc +*.mtc1 +*.out +*.synctex.gz +*.scg + +#IntelliJ +*.iml +/.idea/ diff --git a/HadoopSS23/.gitignore b/HadoopSS23/.gitignore deleted file mode 100644 index f4714b5..0000000 --- a/HadoopSS23/.gitignore +++ /dev/null @@ -1,55 +0,0 @@ -# ---> Eclipse -*.pydevproject -.metadata -.gradle -bin/ -tmp/ -*.tmp -*.bak -*.swp -*~.nib -local.properties -.settings/ -.loadpath - -# Eclipse Core -.project - -# External tool builders -.externalToolBuilders/ - -# Locally stored "Eclipse launch configurations" -*.launch - -# CDT-specific -.cproject - -# JDT-specific (Eclipse Java Development Tools) -.classpath - -# Java annotation processor (APT) -.factorypath - -# PDT-specific -.buildpath - -# sbteclipse plugin -.target - -# TeXlipse plugin -.texlipse - -# ---> Java -*.class - -# Mobile Tools for Java (J2ME) -.mtj.tmp/ - -# Package Files # -*.jar -*.war -*.ear - -# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml -hs_err_pid* - diff --git a/HadoopSS23/.idea/modules.xml b/HadoopSS23/.idea/modules.xml index cd9b5fc..c10454a 100644 --- a/HadoopSS23/.idea/modules.xml +++ b/HadoopSS23/.idea/modules.xml @@ -2,6 +2,7 @@ + diff --git a/HadoopSS23/Hadoop/.gitignore b/HadoopSS23/Hadoop/.gitignore deleted file mode 100644 index b83d222..0000000 --- a/HadoopSS23/Hadoop/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/target/ diff --git a/HadoopSS23/Hadoop/.idea/vcs.xml b/HadoopSS23/Hadoop/.idea/vcs.xml index 6c0b863..64713b8 100644 --- a/HadoopSS23/Hadoop/.idea/vcs.xml +++ b/HadoopSS23/Hadoop/.idea/vcs.xml @@ -1,6 +1,7 @@ + \ No newline at end of file diff --git 
package de.hsma.bdea;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskCounter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.IOException;

/**
 * Hadoop MapReduce word-count job demonstrating counters.
 *
 * <p>Counts word occurrences in the input corpus and additionally tracks the
 * number of distinct output words via a custom counter ("mygroup"/"words"),
 * which is compared against the built-in {@link TaskCounter#REDUCE_OUTPUT_RECORDS}.
 */
public class WordCountVLCounter {

    /** Splits each input line on non-word characters and emits (word, 1) pairs. */
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // \W+ also produces a leading empty token when the line starts with
            // a non-word character; it is emitted as-is (matches original behavior).
            String[] tokens = value.toString().split("\\W+");

            for (String token : tokens) {
                word.set(token);
                context.write(word, ONE);
            }
        }
    }

    /** Local (map-side) pre-aggregation: sums the partial counts per word. */
    public static class IntSumCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Final reducer: sums the counts per word and increments the custom
     * "words" counter once per distinct word.
     *
     * <p>Deliberately NOT shared with {@link IntSumCombiner}: a combiner may run
     * zero or more times per record, so incrementing the counter there would
     * produce a nondeterministic value. Only the reducer runs exactly once per key.
     */
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);

            // One increment per distinct word (i.e. per reduce() invocation).
            context.getCounter("mygroup", "words").increment(1);
        }
    }

    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure(); // Log4j setup (alternatively: config file in the resources folder)
        System.setProperty("hadoop.home.dir", "/"); // required for Hadoop 3.3.0

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountVLCounter.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumCombiner.class); // local reduce on the mapper side
        job.setReducerClass(IntSumReducer.class);   // final reduce after the shuffle
        job.setNumReduceTasks(4);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path("resources/klassiker"));

        // Timestamped output directory so reruns never collide with existing output.
        String output1 = "resources/wordcount-output1-" + System.currentTimeMillis();
        FileOutputFormat.setOutputPath(job, new Path(output1));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.waitForCompletion(true);

        // -- Hadoop counters --
        // Built-in counter (see TaskCounter.* for others).
        long outputRecords = job.getCounters().findCounter(TaskCounter.REDUCE_OUTPUT_RECORDS).getValue();

        // Custom counter (important: incremented only in the reducer, never in the combiner).
        long words = job.getCounters().findCounter("mygroup", "words").getValue();

        // Both counts are identical: the reducer emits exactly one record per word.
        System.out.println("#Records => " + outputRecords);
        System.out.println("#Words => " + words);
    }
}