Hadoop CardCount, WordCount

main
SoftwareObservatorium 2023-04-04 10:39:00 +02:00
parent 793ba40dda
commit 017680ce25
22 changed files with 141423 additions and 0 deletions

HadoopSS23/.gitignore vendored 100644

@@ -0,0 +1,55 @@
# ---> Eclipse
*.pydevproject
.metadata
.gradle
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath
# Eclipse Core
.project
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# CDT-specific
.cproject
# JDT-specific (Eclipse Java Development Tools)
.classpath
# Java annotation processor (APT)
.factorypath
# PDT-specific
.buildpath
# sbteclipse plugin
.target
# TeXlipse plugin
.texlipse
# ---> Java
*.class
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package Files #
*.jar
*.war
*.ear
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*

HadoopSS23/.idea/.gitignore vendored 100644

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

HadoopSS23/.idea/HadoopSS23.iml 100644

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

HadoopSS23/.idea/modules.xml 100644

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/HadoopSS23.iml" filepath="$PROJECT_DIR$/.idea/HadoopSS23.iml" />
    </modules>
  </component>
</project>

HadoopSS23/.idea/vcs.xml 100644

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
  </component>
</project>

HadoopSS23/Hadoop/.gitignore vendored 100644

@@ -0,0 +1 @@
/target/

HadoopSS23/Hadoop/.idea/.gitignore 100644

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

HadoopSS23/Hadoop/.idea/compiler.xml 100644

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="CompilerConfiguration">
    <annotationProcessing>
      <profile name="Maven default annotation processors profile" enabled="true">
        <sourceOutputDir name="target/generated-sources/annotations" />
        <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
        <outputRelativeToContentRoot value="true" />
        <module name="Hadoop" />
      </profile>
    </annotationProcessing>
  </component>
</project>

HadoopSS23/Hadoop/.idea/jarRepositories.xml 100644

@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="RemoteRepositoriesConfiguration">
    <remote-repository>
      <option name="id" value="central" />
      <option name="name" value="Central Repository" />
      <option name="url" value="https://repo.maven.apache.org/maven2" />
    </remote-repository>
    <remote-repository>
      <option name="id" value="central" />
      <option name="name" value="Maven Central repository" />
      <option name="url" value="https://repo1.maven.org/maven2" />
    </remote-repository>
    <remote-repository>
      <option name="id" value="jboss.community" />
      <option name="name" value="JBoss Community repository" />
      <option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
    </remote-repository>
  </component>
</project>

HadoopSS23/Hadoop/.idea/misc.xml 100644

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ExternalStorageConfigurationManager" enabled="true" />
  <component name="MavenProjectsManager">
    <option name="originalFiles">
      <list>
        <option value="$PROJECT_DIR$/pom.xml" />
      </list>
    </option>
  </component>
  <component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="11 (2)" project-jdk-type="JavaSDK" />
</project>

HadoopSS23/Hadoop/.idea/vcs.xml 100644

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
  </component>
</project>

HadoopSS23/Hadoop/pom.xml 100644

@@ -0,0 +1,79 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>de.hsma.bdea</groupId>
  <artifactId>Hadoop</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <hadoop.version>3.3.5</hadoop.version>
    <slf4j.version>1.7.36</slf4j.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>${slf4j.version}</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>${slf4j.version}</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.8.0</version>
        <configuration>
          <!-- compile for Java 8; Hadoop 3.x requires Java 8 or newer -->
          <release>8</release>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
            <configuration>
              <archive>
                <manifest>
                  <mainClass>de.hsma.bdea.CardCount</mainClass>
                </manifest>
              </archive>
              <descriptorRefs>
                <descriptorRef>jar-with-dependencies</descriptorRef>
              </descriptorRefs>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>

HadoopSS23/Hadoop/resources/.gitignore 100644

@@ -0,0 +1 @@
/*output*

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

HadoopSS23/Hadoop/src/main/java/de/hsma/bdea/BigDataKartenGenerator.java 100644

@@ -0,0 +1,21 @@
package de.hsma.bdea;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;

public class BigDataKartenGenerator {
    public static void main(String[] args) throws FileNotFoundException {
        String[] farben = {"Kreuz", "Pik", "Herz", "Karo"};
        String[] werte = {"2", "3", "4", "5", "6", "7", "8", "9", "10", "Bube", "Dame", "König", "As"};

        // redirect System.out into the CSV file
        System.setOut(new PrintStream(new FileOutputStream("resources/karten2.csv")));

        // emit random card rows in the format: id, suit, value
        for (int i = 0; i < 123294; i++) {
            System.out.println(i + ", " + farben[(int)(Math.random() * 4)] + ", " + werte[(int)(Math.random() * 13)]);
        }
    }
}
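
A note on reproducibility: the generator above draws from Math.random(), so every run produces a different deck. A minimal, hypothetical variant using a seeded java.util.Random instead (the class name SeededKartenGenerator and the output file karten-seeded.csv are made up for illustration; this sketch is not part of the commit):

package de.hsma.bdea;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.Random;

public class SeededKartenGenerator {
    public static void main(String[] args) throws FileNotFoundException {
        String[] farben = {"Kreuz", "Pik", "Herz", "Karo"};
        String[] werte = {"2", "3", "4", "5", "6", "7", "8", "9", "10", "Bube", "Dame", "König", "As"};
        Random rnd = new Random(42); // fixed seed, so the generated CSV is identical across runs
        System.setOut(new PrintStream(new FileOutputStream("resources/karten-seeded.csv")));
        for (int i = 0; i < 123294; i++) {
            System.out.println(i + ", " + farben[rnd.nextInt(4)] + ", " + werte[rnd.nextInt(13)]);
        }
    }
}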

HadoopSS23/Hadoop/src/main/java/de/hsma/bdea/CardCount.java 100644

@@ -0,0 +1,62 @@
package de.hsma.bdea;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;
public class CardCount {
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text color = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] spalten = value.toString().split(","); // row format: id, suit, value
            color.set(spalten[1]); // column 1 is the suit
            context.write(color, one); // emit (suit, 1)
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure(); // Log4j setup; alternatively, place a config file in the resources folder
        System.setProperty("hadoop.home.dir", "/"); // mandatory for Hadoop 3.3.0

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "card count");
        job.setJarByClass(CardCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class); // local reduce on the mapper side
        job.setReducerClass(IntSumReducer.class); // reduce after shuffling, on the reducer side
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path("resources/karten.csv"));
        FileOutputFormat.setOutputPath(job, new Path("resources/karten-output" + System.currentTimeMillis()));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
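
One detail worth noting: split(",") keeps the leading space that follows each comma in rows shaped like the ones BigDataKartenGenerator emits, so the emitted suit keys are strings like " Herz" rather than "Herz". A hypothetical, more defensive mapper that trims the suit column and skips malformed rows (the class name RobustCardMapper is made up; a sketch, not part of this commit):

package de.hsma.bdea;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical variant of TokenizerMapper: trims the ", "-separated suit
// column and ignores rows that do not have all three columns.
public class RobustCardMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private final Text color = new Text();

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String[] columns = value.toString().split(",");
        if (columns.length < 3) {
            return; // skip rows missing id, suit, or value
        }
        color.set(columns[1].trim()); // drop the leading space after ", "
        context.write(color, one);
    }
}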

HadoopSS23/Hadoop/src/main/java/de/hsma/bdea/WordCountVL.java 100644

@@ -0,0 +1,99 @@
package de.hsma.bdea;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.BasicConfigurator;
public class WordCountVL {
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] woerter = value.toString().split("\\W+"); // note: may produce an empty first token
            for (String wort : woerter) {
                word.set(wort);
                context.write(word, one);
            }
        }
    }

    public static class IdentityMapper extends Mapper<Text, IntWritable, Text, IntWritable> {
        public void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
            context.write(key, value); // pass (word, count) through unchanged
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static class DoNothingReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            context.write(key, values.iterator().next()); // each key carries exactly one value here
        }
    }

    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure(); // Log4j setup; alternatively, place a config file in the resources folder
        System.setProperty("hadoop.home.dir", "/"); // mandatory for Hadoop 3.3.0

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountVL.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class); // local reduce on the mapper side
        job.setReducerClass(IntSumReducer.class); // reduce after shuffling, on the reducer side
        job.setNumReduceTasks(4);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path("resources/klassiker"));
        String output1 = "resources/wordcount-output1-" + System.currentTimeMillis();
        FileOutputFormat.setOutputPath(job, new Path(output1));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.waitForCompletion(true);
        // --- end of job 1 ---

        // job 2: merge the four partitions into one globally sorted result
        job = Job.getInstance(conf, "word count sort");
        job.setJarByClass(WordCountVL.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(DoNothingReducer.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(output1 + "/part-r-*"));
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("resources/wordcount-output2-" + System.currentTimeMillis()));
        job.waitForCompletion(true);
    }
}
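
The second job merges the four partitions through a single reducer, and because the shuffle sorts by key, the final output is ordered alphabetically by word. Ordering by frequency instead would need one more map stage that swaps key and value before the shuffle. A sketch under that assumption (SwapMapper is hypothetical and not part of this commit):

package de.hsma.bdea;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper for a frequency-sorted variant: emits (count, word)
// so that the shuffle sorts numerically by count instead of by word.
public class SwapMapper extends Mapper<Text, IntWritable, IntWritable, Text> {
    @Override
    public void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
        context.write(value, key); // count becomes the key, word the value
    }
}

Wired into the chain in place of IdentityMapper (with the job's output key/value classes adjusted to IntWritable/Text), the single reducer would then emit words in ascending order of their counts.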

HadoopSS23/README.md 100644

@@ -0,0 +1,8 @@
# HadoopSS23

Based on materials from Prof. Hummel (see https://git.informatik.hs-mannheim.de/o.hummel/HadoopSS22/).

Contains:

* CardCount
* WordCount
* WordCount + Counter (see the sketch below)
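
The third item, "WordCount + Counter", is named in the README but its code is not visible in this diff. A minimal sketch of what a counter-instrumented mapper could look like, assuming a custom enum-based Hadoop Counter (the enum Tokens and the class name CountingTokenizerMapper are made up for illustration):

package de.hsma.bdea;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical sketch: a tokenizing mapper that tracks token statistics
// via Hadoop Counters in addition to emitting (word, 1) pairs.
public class CountingTokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    enum Tokens { TOTAL, EMPTY } // hypothetical counter names

    private final static IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        for (String token : value.toString().split("\\W+")) {
            context.getCounter(Tokens.TOTAL).increment(1);
            if (token.isEmpty()) {
                context.getCounter(Tokens.EMPTY).increment(1); // count and skip empty tokens
                continue;
            }
            word.set(token);
            context.write(word, one);
        }
    }
}

After job.waitForCompletion(true), the aggregated totals would be readable in the driver via job.getCounters().findCounter(Tokens.EMPTY).getValue(), and they also appear in the job's log output.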