Hadoop CardCount, WordCount
parent 793ba40dda
commit 017680ce25
@@ -0,0 +1,55 @@
# ---> Eclipse
*.pydevproject
.metadata
.gradle
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath

# Eclipse Core
.project

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# JDT-specific (Eclipse Java Development Tools)
.classpath

# Java annotation processor (APT)
.factorypath

# PDT-specific
.buildpath

# sbteclipse plugin
.target

# TeXlipse plugin
.texlipse

# ---> Java
*.class

# Mobile Tools for Java (J2ME)
.mtj.tmp/

# Package Files #
*.jar
*.war
*.ear

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/HadoopSS23.iml" filepath="$PROJECT_DIR$/.idea/HadoopSS23.iml" />
    </modules>
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
  </component>
</project>
@@ -0,0 +1 @@
/target/
@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="CompilerConfiguration">
    <annotationProcessing>
      <profile name="Maven default annotation processors profile" enabled="true">
        <sourceOutputDir name="target/generated-sources/annotations" />
        <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
        <outputRelativeToContentRoot value="true" />
        <module name="Hadoop" />
      </profile>
    </annotationProcessing>
  </component>
</project>
@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="RemoteRepositoriesConfiguration">
    <remote-repository>
      <option name="id" value="central" />
      <option name="name" value="Central Repository" />
      <option name="url" value="https://repo.maven.apache.org/maven2" />
    </remote-repository>
    <remote-repository>
      <option name="id" value="central" />
      <option name="name" value="Maven Central repository" />
      <option name="url" value="https://repo1.maven.org/maven2" />
    </remote-repository>
    <remote-repository>
      <option name="id" value="jboss.community" />
      <option name="name" value="JBoss Community repository" />
      <option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
    </remote-repository>
  </component>
</project>
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ExternalStorageConfigurationManager" enabled="true" />
  <component name="MavenProjectsManager">
    <option name="originalFiles">
      <list>
        <option value="$PROJECT_DIR$/pom.xml" />
      </list>
    </option>
  </component>
  <component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="11 (2)" project-jdk-type="JavaSDK" />
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
  </component>
</project>
@@ -0,0 +1,79 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>de.hsma.bdea</groupId>
  <artifactId>Hadoop</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <hadoop.version>3.3.5</hadoop.version>
    <slf4j.version>1.7.36</slf4j.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>${hadoop.version}</version>
    </dependency>

    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>${slf4j.version}</version>
    </dependency>

    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>${slf4j.version}</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.8.0</version>
        <configuration>
          <!-- release supersedes source/target; the original, contradictory
               release 7 / source 1.8 / target 1.8 trio is collapsed into one
               consistent setting (Hadoop 3.3.x requires Java 8+) -->
          <release>8</release>
        </configuration>
      </plugin>

      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
            <configuration>
              <archive>
                <manifest>
                  <mainClass>de.hsma.bdea.CardCount</mainClass>
                </manifest>
              </archive>
              <descriptorRefs>
                <descriptorRef>jar-with-dependencies</descriptorRef>
              </descriptorRefs>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>
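With this build setup, `mvn package` should produce a runnable fat jar whose manifest entry point is de.hsma.bdea.CardCount; assuming Maven's default naming derived from the artifactId and version above, the job could then be started with `java -jar target/Hadoop-0.0.1-SNAPSHOT-jar-with-dependencies.jar`.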
@@ -0,0 +1 @@
/*output*
5 file diffs suppressed because they are too large
@@ -0,0 +1,21 @@
package de.hsma.bdea;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;

public class BigDataKartenGenerator {

    public static void main(String[] args) throws FileNotFoundException {
        String[] farben = {"Kreuz", "Pik", "Herz", "Karo"};
        String[] werte = {"2", "3", "4", "5", "6", "7", "8", "9", "10", "Bube", "Dame", "König", "As"};

        // redirect stdout into the CSV file, then emit one random card per line
        System.setOut(new PrintStream(new FileOutputStream("resources/karten2.csv")));

        for (int i = 0; i < 123294; i++) {
            System.out.println(i + ", " + farben[(int) (Math.random() * 4)] + ", " + werte[(int) (Math.random() * 13)]);
        }
    }
}
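For illustration, the first lines of a generated karten2.csv would look something like this (suit and value are drawn at random, so actual lines differ):

0, Herz, Bube
1, Karo, 7
2, Pik, As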
@@ -0,0 +1,62 @@
package de.hsma.bdea;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

public class CardCount {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text color = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] spalten = value.toString().split(",");
            color.set(spalten[1].trim()); // trim the space left over from the ", " separator in the CSV
            context.write(color, one);    // emit (color, 1)
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure(); // Log4j setup; alternatively place a config file in the resources folder
        System.setProperty("hadoop.home.dir", "/"); // required for Hadoop 3.3.x

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "card count");
        job.setJarByClass(CardCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class); // local reduce on the mapper side
        job.setReducerClass(IntSumReducer.class);  // reduce after the shuffle, on the reducer side
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path("resources/karten.csv"));
        FileOutputFormat.setOutputPath(job, new Path("resources/karten-output" + System.currentTimeMillis()));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
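Run against a deck of the generator's 123294 cards, the single reduce task writes one part-r-00000 file with one tab-separated, alphabetically sorted line per suit; the counts below are illustrative only, since the deck is random:

Herz	30793
Karo	30856
Kreuz	30771
Pik	30874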
@@ -0,0 +1,99 @@
package de.hsma.bdea;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.BasicConfigurator;

public class WordCountVL {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] woerter = value.toString().split("\\W+");

            for (String wort : woerter) {
                if (wort.isEmpty()) { // split("\\W+") yields an empty token when a line starts with a non-word character
                    continue;
                }
                word.set(wort);
                context.write(word, one);
            }
        }
    }

    // passes SequenceFile records through unchanged; used as the mapper of the second job
    public static class IdentityMapper extends Mapper<Text, IntWritable, Text, IntWritable> {
        @Override
        public void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    // job 1 emits each key exactly once, so the first value already is the final count
    public static class DoNothingReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            context.write(key, values.iterator().next());
        }
    }

    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure(); // Log4j setup; alternatively place a config file in the resources folder
        System.setProperty("hadoop.home.dir", "/"); // required for Hadoop 3.3.x

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountVL.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class); // local reduce on the mapper side
        job.setReducerClass(IntSumReducer.class);  // reduce after the shuffle, on the reducer side
        job.setNumReduceTasks(4);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path("resources/klassiker"));

        String output1 = "resources/wordcount-output1-" + System.currentTimeMillis();
        FileOutputFormat.setOutputPath(job, new Path(output1));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.waitForCompletion(true);

        // --- end of job 1 ---

        job = Job.getInstance(conf, "word count sort");
        job.setJarByClass(WordCountVL.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(DoNothingReducer.class);
        job.setNumReduceTasks(1); // a single reducer merge-sorts all keys into one globally sorted output file
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(output1 + "/part-r-*"));
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("resources/wordcount-output2-" + System.currentTimeMillis()));

        job.waitForCompletion(true);
    }
}
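The README below also lists a "WordCount + Counter" variant, which is not visible in this diff. As a minimal sketch of how such a mapper might use Hadoop's built-in counter API — the class name, enum names, and counting rules here are assumptions for illustration, not the repository's actual code:

package de.hsma.bdea;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical sketch: counts words while tracking job-wide statistics via Hadoop counters.
public class WordCountCounterSketch {
    public enum WordStats { TOTAL_WORDS, EMPTY_TOKENS } // assumed counter names

    public static class CountingMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            for (String wort : value.toString().split("\\W+")) {
                if (wort.isEmpty()) {
                    context.getCounter(WordStats.EMPTY_TOKENS).increment(1); // aggregated across all tasks by the framework
                    continue;
                }
                context.getCounter(WordStats.TOTAL_WORDS).increment(1);
                word.set(wort);
                context.write(word, one);
            }
        }
    }
}

The framework aggregates these counters across all map and reduce tasks and reports the totals when the job finishes (and via job.getCounters()).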
@ -0,0 +1,8 @@
|
|||
# HadoopSS23
|
||||
|
||||
Basierend auf Materialien von Prof. Hummel (siehe https://git.informatik.hs-mannheim.de/o.hummel/HadoopSS22/).
|
||||
|
||||
Enthält
|
||||
* CardCount
|
||||
* WordCount
|
||||
* WordCount + Counter
|