Apache Spark Beispiele
parent: db7de9cc8f
commit: 18147cd7fe
SecondarySorting.java
@@ -53,7 +53,7 @@ public class SecondarySorting {
             st.nextToken(); // ignore day
             String temp = st.nextToken();

-            System.out.println(new Text(yearMonth + ":" + "000".substring(0, 3-temp.length()) + temp).toString() + " -> " + value);
+            System.out.println("Mapper -> " + new Text(yearMonth + ":" + "000".substring(0, 3-temp.length()) + temp).toString() + " -> " + value);
             context.write(new Text(yearMonth + ":" + "000".substring(0, 3-temp.length()) + temp), value);
         }
     }
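Side note on the key format above: the expression "000".substring(0, 3-temp.length()) + temp left-pads the temperature to three digits, so the composite yearMonth:temperature keys also sort correctly when compared as plain strings.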
@@ -84,11 +84,17 @@ public class SecondarySorting {
         public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
             StringBuilder sb = new StringBuilder();

+            int c = 0;
+            System.out.println("blub");
             for (Text val : values) {
                 String s = val.toString();
-                System.out.println(s);
+                // note that key changes in each iteration
+                // because of MyGroupingComparator (i.e., groups keys)
+                System.out.println("Reducer -> " + key + " = " + s);
                 sb.append(s.substring(s.lastIndexOf(",") + 1, s.length()).trim() + " ");
+                c++;
             }
+            System.out.println("Reducer (#values = " + c + ")");

             result.set(sb.toString());

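For orientation (not part of this commit): the comments above refer to a MyGroupingComparator that is not shown in this diff. It groups the composite yearMonth:temperature keys by their yearMonth prefix, so one reduce() call receives all temperatures of a month in sorted order. A minimal sketch of such a grouping comparator, assuming Text keys of exactly that form:

```java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical sketch, not taken from this diff: group composite keys
// "yearMonth:temperature" by the yearMonth prefix only, so that one
// reduce() call sees all temperatures of a month, sorted by full key.
public class MyGroupingComparator extends WritableComparator {

    protected MyGroupingComparator() {
        super(Text.class, true); // true = create key instances for comparison
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        String ka = a.toString();
        String kb = b.toString();
        // compare only the part before ':' (the yearMonth); keys with the
        // same month therefore land in the same reduce() call
        return ka.substring(0, ka.indexOf(':'))
                 .compareTo(kb.substring(0, kb.indexOf(':')));
    }
}
```

Such a comparator would be registered via job.setGroupingComparatorClass(MyGroupingComparator.class); the zero-padded string keys then provide the sort order within each group.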
WordCountVL.java
@@ -22,6 +22,9 @@ public class WordCountVL {
         private Text word = new Text();

         public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
+            //System.out.println(key);
+            //System.out.println(value);
+
             String[] woerter = value.toString().split("\\W+");

             for (String wort: woerter) {
@@ -52,7 +55,7 @@ public class WordCountVL {

     public static class DoNothingReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
         public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
-            context.write(key, values.iterator().next());
+            context.write(key, values.iterator().next()); // only one value, see IdentityMapper.class
         }
     }
@@ -67,7 +70,7 @@ public class WordCountVL {
         job.setMapperClass(TokenizerMapper.class);
         job.setCombinerClass(IntSumReducer.class); // reduce locally at the mappers
         job.setReducerClass(IntSumReducer.class); // reduce after distribution at the reducers
-        job.setNumReduceTasks(4);
+        job.setNumReduceTasks(4); // 4 tasks in parallel
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(IntWritable.class);

@@ -81,11 +84,11 @@ public class WordCountVL {

         // --- End of Job 1 ---

-        job = Job.getInstance(conf, "word count sort");
+        job = Job.getInstance(conf, "word count sort"); // natural ordering of keys
         job.setJarByClass(WordCountVL.class);
         job.setMapperClass(IdentityMapper.class);
         job.setReducerClass(DoNothingReducer.class);
-        job.setNumReduceTasks(1);
+        job.setNumReduceTasks(1); // 1 task in parallel
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(IntWritable.class);

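For orientation (not part of this commit): the second job reuses an IdentityMapper, which is referenced above but not shown in this diff, together with DoNothingReducer to re-sort job 1's word counts by their natural key order. A minimal sketch of what such an identity mapper typically looks like, assuming job 1's output is read back as (Text, IntWritable) pairs, e.g. via SequenceFileInputFormat:

```java
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical sketch, not taken from this diff: passes every pair through
// unchanged; the shuffle of job 2 then sorts all pairs by key, and the
// single reducer writes them out in that order.
public class IdentityMapper extends Mapper<Text, IntWritable, Text, IntWritable> {

    @Override
    public void map(Text key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        context.write(key, value); // no transformation; sorting happens in the shuffle
    }
}
```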
.gitignore
@@ -0,0 +1,41 @@
# ---> Eclipse
*.pydevproject
.metadata
.gradle
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath

# Eclipse Core
.project

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# JDT-specific (Eclipse Java Development Tools)
.classpath

# Java annotation processor (APT)
.factorypath

# PDT-specific
.buildpath

# sbteclipse plugin
.target

# TeXlipse plugin
.texlipse
@@ -0,0 +1,2 @@
# Spark-It

Binary file not shown.
.mvn/wrapper/maven-wrapper.properties
@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.1/apache-maven-3.9.1-bin.zip
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar
@@ -0,0 +1,20 @@
# VM Options

Java 17 needs the following --add-opens VM options (see https://stackoverflow.com/questions/73465937/apache-spark-3-3-0-breaks-on-java-17-with-cannot-access-class-sun-nio-ch-direct):

```shell
--add-opens=java.base/java.lang=ALL-UNNAMED
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
--add-opens=java.base/java.io=ALL-UNNAMED
--add-opens=java.base/java.net=ALL-UNNAMED
--add-opens=java.base/java.nio=ALL-UNNAMED
--add-opens=java.base/java.util=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
--add-opens=java.base/sun.nio.cs=ALL-UNNAMED
--add-opens=java.base/sun.security.action=ALL-UNNAMED
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED
--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED
```
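One way to apply these flags from the command line (a sketch under assumptions, not part of the repo: exec:java presumes the exec-maven-plugin can be resolved, and the main class is just one example from this commit):

```shell
# Export the options via MAVEN_OPTS so the Maven JVM (which exec:java
# runs in) picks them up, then start one of the example classes.
export MAVEN_OPTS="--add-opens=java.base/java.lang=ALL-UNNAMED \
  --add-opens=java.base/sun.nio.ch=ALL-UNNAMED"  # plus the remaining flags above

./mvnw compile exec:java -Dexec.mainClass=de.hsma.informatik.bdea.WordCount
```

In Eclipse, the same list goes into Run Configuration > Arguments > VM arguments.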
mvnw
@@ -0,0 +1,308 @@
#!/bin/sh
# ----------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------

# ----------------------------------------------------------------------------
# Apache Maven Wrapper startup batch script, version 3.2.0
#
# Required ENV vars:
# ------------------
#   JAVA_HOME - location of a JDK home dir
#
# Optional ENV vars
# -----------------
#   MAVEN_OPTS - parameters passed to the Java VM when running Maven
#     e.g. to debug Maven itself, use
#       set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
#   MAVEN_SKIP_RC - flag to disable loading of mavenrc files
# ----------------------------------------------------------------------------

if [ -z "$MAVEN_SKIP_RC" ] ; then

  if [ -f /usr/local/etc/mavenrc ] ; then
    . /usr/local/etc/mavenrc
  fi

  if [ -f /etc/mavenrc ] ; then
    . /etc/mavenrc
  fi

  if [ -f "$HOME/.mavenrc" ] ; then
    . "$HOME/.mavenrc"
  fi

fi

# OS specific support.  $var _must_ be set to either true or false.
cygwin=false;
darwin=false;
mingw=false
case "$(uname)" in
  CYGWIN*) cygwin=true ;;
  MINGW*) mingw=true;;
  Darwin*) darwin=true
    # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home
    # See https://developer.apple.com/library/mac/qa/qa1170/_index.html
    if [ -z "$JAVA_HOME" ]; then
      if [ -x "/usr/libexec/java_home" ]; then
        JAVA_HOME="$(/usr/libexec/java_home)"; export JAVA_HOME
      else
        JAVA_HOME="/Library/Java/Home"; export JAVA_HOME
      fi
    fi
    ;;
esac

if [ -z "$JAVA_HOME" ] ; then
  if [ -r /etc/gentoo-release ] ; then
    JAVA_HOME=$(java-config --jre-home)
  fi
fi

# For Cygwin, ensure paths are in UNIX format before anything is touched
if $cygwin ; then
  [ -n "$JAVA_HOME" ] &&
    JAVA_HOME=$(cygpath --unix "$JAVA_HOME")
  [ -n "$CLASSPATH" ] &&
    CLASSPATH=$(cygpath --path --unix "$CLASSPATH")
fi

# For Mingw, ensure paths are in UNIX format before anything is touched
if $mingw ; then
  [ -n "$JAVA_HOME" ] && [ -d "$JAVA_HOME" ] &&
    JAVA_HOME="$(cd "$JAVA_HOME" || (echo "cannot cd into $JAVA_HOME."; exit 1); pwd)"
fi

if [ -z "$JAVA_HOME" ]; then
  javaExecutable="$(which javac)"
  if [ -n "$javaExecutable" ] && ! [ "$(expr "\"$javaExecutable\"" : '\([^ ]*\)')" = "no" ]; then
    # readlink(1) is not available as standard on Solaris 10.
    readLink=$(which readlink)
    if [ ! "$(expr "$readLink" : '\([^ ]*\)')" = "no" ]; then
      if $darwin ; then
        javaHome="$(dirname "\"$javaExecutable\"")"
        javaExecutable="$(cd "\"$javaHome\"" && pwd -P)/javac"
      else
        javaExecutable="$(readlink -f "\"$javaExecutable\"")"
      fi
      javaHome="$(dirname "\"$javaExecutable\"")"
      javaHome=$(expr "$javaHome" : '\(.*\)/bin')
      JAVA_HOME="$javaHome"
      export JAVA_HOME
    fi
  fi
fi

if [ -z "$JAVACMD" ] ; then
  if [ -n "$JAVA_HOME" ] ; then
    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
      # IBM's JDK on AIX uses strange locations for the executables
      JAVACMD="$JAVA_HOME/jre/sh/java"
    else
      JAVACMD="$JAVA_HOME/bin/java"
    fi
  else
    JAVACMD="$(\unset -f command 2>/dev/null; \command -v java)"
  fi
fi

if [ ! -x "$JAVACMD" ] ; then
  echo "Error: JAVA_HOME is not defined correctly." >&2
  echo "  We cannot execute $JAVACMD" >&2
  exit 1
fi

if [ -z "$JAVA_HOME" ] ; then
  echo "Warning: JAVA_HOME environment variable is not set."
fi

# traverses directory structure from process work directory to filesystem root
# first directory with .mvn subdirectory is considered project base directory
find_maven_basedir() {
  if [ -z "$1" ]
  then
    echo "Path not specified to find_maven_basedir"
    return 1
  fi

  basedir="$1"
  wdir="$1"
  while [ "$wdir" != '/' ] ; do
    if [ -d "$wdir"/.mvn ] ; then
      basedir=$wdir
      break
    fi
    # workaround for JBEAP-8937 (on Solaris 10/Sparc)
    if [ -d "${wdir}" ]; then
      wdir=$(cd "$wdir/.." || exit 1; pwd)
    fi
    # end of workaround
  done
  printf '%s' "$(cd "$basedir" || exit 1; pwd)"
}

# concatenates all lines of a file
concat_lines() {
  if [ -f "$1" ]; then
    # Remove \r in case we run on Windows within Git Bash
    # and check out the repository with auto CRLF management
    # enabled. Otherwise, we may read lines that are delimited with
    # \r\n and produce $'-Xarg\r' rather than -Xarg due to word
    # splitting rules.
    tr -s '\r\n' ' ' < "$1"
  fi
}

log() {
  if [ "$MVNW_VERBOSE" = true ]; then
    printf '%s\n' "$1"
  fi
}

BASE_DIR=$(find_maven_basedir "$(dirname "$0")")
if [ -z "$BASE_DIR" ]; then
  exit 1;
fi

MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"}; export MAVEN_PROJECTBASEDIR
log "$MAVEN_PROJECTBASEDIR"

##########################################################################################
# Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
# This allows using the maven wrapper in projects that prohibit checking in binary data.
##########################################################################################
wrapperJarPath="$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar"
if [ -r "$wrapperJarPath" ]; then
  log "Found $wrapperJarPath"
else
  log "Couldn't find $wrapperJarPath, downloading it ..."

  if [ -n "$MVNW_REPOURL" ]; then
    wrapperUrl="$MVNW_REPOURL/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar"
  else
    wrapperUrl="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar"
  fi
  while IFS="=" read -r key value; do
    # Remove '\r' from value to allow usage on windows as IFS does not consider '\r' as a separator ( considers space, tab, new line ('\n'), and custom '=' )
    safeValue=$(echo "$value" | tr -d '\r')
    case "$key" in (wrapperUrl) wrapperUrl="$safeValue"; break ;;
    esac
  done < "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.properties"
  log "Downloading from: $wrapperUrl"

  if $cygwin; then
    wrapperJarPath=$(cygpath --path --windows "$wrapperJarPath")
  fi

  if command -v wget > /dev/null; then
    log "Found wget ... using wget"
    [ "$MVNW_VERBOSE" = true ] && QUIET="" || QUIET="--quiet"
    if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then
      wget $QUIET "$wrapperUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath"
    else
      wget $QUIET --http-user="$MVNW_USERNAME" --http-password="$MVNW_PASSWORD" "$wrapperUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath"
    fi
  elif command -v curl > /dev/null; then
    log "Found curl ... using curl"
    [ "$MVNW_VERBOSE" = true ] && QUIET="" || QUIET="--silent"
    if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then
      curl $QUIET -o "$wrapperJarPath" "$wrapperUrl" -f -L || rm -f "$wrapperJarPath"
    else
      curl $QUIET --user "$MVNW_USERNAME:$MVNW_PASSWORD" -o "$wrapperJarPath" "$wrapperUrl" -f -L || rm -f "$wrapperJarPath"
    fi
  else
    log "Falling back to using Java to download"
    javaSource="$MAVEN_PROJECTBASEDIR/.mvn/wrapper/MavenWrapperDownloader.java"
    javaClass="$MAVEN_PROJECTBASEDIR/.mvn/wrapper/MavenWrapperDownloader.class"
    # For Cygwin, switch paths to Windows format before running javac
    if $cygwin; then
      javaSource=$(cygpath --path --windows "$javaSource")
      javaClass=$(cygpath --path --windows "$javaClass")
    fi
    if [ -e "$javaSource" ]; then
      if [ ! -e "$javaClass" ]; then
        log " - Compiling MavenWrapperDownloader.java ..."
        ("$JAVA_HOME/bin/javac" "$javaSource")
      fi
      if [ -e "$javaClass" ]; then
        log " - Running MavenWrapperDownloader.java ..."
        ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$wrapperUrl" "$wrapperJarPath") || rm -f "$wrapperJarPath"
      fi
    fi
  fi
fi
##########################################################################################
# End of extension
##########################################################################################

# If specified, validate the SHA-256 sum of the Maven wrapper jar file
wrapperSha256Sum=""
while IFS="=" read -r key value; do
  case "$key" in (wrapperSha256Sum) wrapperSha256Sum=$value; break ;;
  esac
done < "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.properties"
if [ -n "$wrapperSha256Sum" ]; then
  wrapperSha256Result=false
  if command -v sha256sum > /dev/null; then
    if echo "$wrapperSha256Sum  $wrapperJarPath" | sha256sum -c > /dev/null 2>&1; then
      wrapperSha256Result=true
    fi
  elif command -v shasum > /dev/null; then
    if echo "$wrapperSha256Sum  $wrapperJarPath" | shasum -a 256 -c > /dev/null 2>&1; then
      wrapperSha256Result=true
    fi
  else
    echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available."
    echo "Please install either command, or disable validation by removing 'wrapperSha256Sum' from your maven-wrapper.properties."
    exit 1
  fi
  if [ $wrapperSha256Result = false ]; then
    echo "Error: Failed to validate Maven wrapper SHA-256, your Maven wrapper might be compromised." >&2
    echo "Investigate or delete $wrapperJarPath to attempt a clean download." >&2
    echo "If you updated your Maven version, you need to update the specified wrapperSha256Sum property." >&2
    exit 1
  fi
fi

MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS"

# For Cygwin, switch paths to Windows format before running java
if $cygwin; then
  [ -n "$JAVA_HOME" ] &&
    JAVA_HOME=$(cygpath --path --windows "$JAVA_HOME")
  [ -n "$CLASSPATH" ] &&
    CLASSPATH=$(cygpath --path --windows "$CLASSPATH")
  [ -n "$MAVEN_PROJECTBASEDIR" ] &&
    MAVEN_PROJECTBASEDIR=$(cygpath --path --windows "$MAVEN_PROJECTBASEDIR")
fi

# Provide a "standardized" way to retrieve the CLI args that will
# work with both Windows and non-Windows executions.
MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $*"
export MAVEN_CMD_LINE_ARGS

WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain

# shellcheck disable=SC2086 # safe args
exec "$JAVACMD" \
  $MAVEN_OPTS \
  $MAVEN_DEBUG_OPTS \
  -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \
  "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \
  ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@"
mvnw.cmd
@@ -0,0 +1,205 @@
@REM ----------------------------------------------------------------------------
@REM Licensed to the Apache Software Foundation (ASF) under one
@REM or more contributor license agreements.  See the NOTICE file
@REM distributed with this work for additional information
@REM regarding copyright ownership.  The ASF licenses this file
@REM to you under the Apache License, Version 2.0 (the
@REM "License"); you may not use this file except in compliance
@REM with the License.  You may obtain a copy of the License at
@REM
@REM    http://www.apache.org/licenses/LICENSE-2.0
@REM
@REM Unless required by applicable law or agreed to in writing,
@REM software distributed under the License is distributed on an
@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@REM KIND, either express or implied.  See the License for the
@REM specific language governing permissions and limitations
@REM under the License.
@REM ----------------------------------------------------------------------------

@REM ----------------------------------------------------------------------------
@REM Apache Maven Wrapper startup batch script, version 3.2.0
@REM
@REM Required ENV vars:
@REM JAVA_HOME - location of a JDK home dir
@REM
@REM Optional ENV vars
@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending
@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
@REM     e.g. to debug Maven itself, use
@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
@REM ----------------------------------------------------------------------------

@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
@echo off
@REM set title of command window
title %0
@REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
@if "%MAVEN_BATCH_ECHO%" == "on"  echo %MAVEN_BATCH_ECHO%

@REM set %HOME% to equivalent of $HOME
if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")

@REM Execute a user defined script before this one
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
@REM check for pre script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_pre.bat" call "%USERPROFILE%\mavenrc_pre.bat" %*
if exist "%USERPROFILE%\mavenrc_pre.cmd" call "%USERPROFILE%\mavenrc_pre.cmd" %*
:skipRcPre

@setlocal

set ERROR_CODE=0

@REM To isolate internal variables from possible post scripts, we use another setlocal
@setlocal

@REM ==== START VALIDATION ====
if not "%JAVA_HOME%" == "" goto OkJHome

echo.
echo Error: JAVA_HOME not found in your environment. >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error

:OkJHome
if exist "%JAVA_HOME%\bin\java.exe" goto init

echo.
echo Error: JAVA_HOME is set to an invalid directory. >&2
echo JAVA_HOME = "%JAVA_HOME%" >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error

@REM ==== END VALIDATION ====

:init

@REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
@REM Fallback to current working directory if not found.

set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir

set EXEC_DIR=%CD%
set WDIR=%EXEC_DIR%
:findBaseDir
IF EXIST "%WDIR%"\.mvn goto baseDirFound
cd ..
IF "%WDIR%"=="%CD%" goto baseDirNotFound
set WDIR=%CD%
goto findBaseDir

:baseDirFound
set MAVEN_PROJECTBASEDIR=%WDIR%
cd "%EXEC_DIR%"
goto endDetectBaseDir

:baseDirNotFound
set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
cd "%EXEC_DIR%"

:endDetectBaseDir

IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig

@setlocal EnableExtensions EnableDelayedExpansion
for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%

:endReadAdditionalConfig

SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain

set WRAPPER_URL="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar"

FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO (
    IF "%%A"=="wrapperUrl" SET WRAPPER_URL=%%B
)

@REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
@REM This allows using the maven wrapper in projects that prohibit checking in binary data.
if exist %WRAPPER_JAR% (
    if "%MVNW_VERBOSE%" == "true" (
        echo Found %WRAPPER_JAR%
    )
) else (
    if not "%MVNW_REPOURL%" == "" (
        SET WRAPPER_URL="%MVNW_REPOURL%/org/apache/maven/wrapper/maven-wrapper/3.2.0/maven-wrapper-3.2.0.jar"
    )
    if "%MVNW_VERBOSE%" == "true" (
        echo Couldn't find %WRAPPER_JAR%, downloading it ...
        echo Downloading from: %WRAPPER_URL%
    )

    powershell -Command "&{"^
        "$webclient = new-object System.Net.WebClient;"^
        "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^
        "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^
        "}"^
        "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%WRAPPER_URL%', '%WRAPPER_JAR%')"^
        "}"
    if "%MVNW_VERBOSE%" == "true" (
        echo Finished downloading %WRAPPER_JAR%
    )
)
@REM End of extension

@REM If specified, validate the SHA-256 sum of the Maven wrapper jar file
SET WRAPPER_SHA_256_SUM=""
FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO (
    IF "%%A"=="wrapperSha256Sum" SET WRAPPER_SHA_256_SUM=%%B
)
IF NOT %WRAPPER_SHA_256_SUM%=="" (
    powershell -Command "&{"^
       "$hash = (Get-FileHash \"%WRAPPER_JAR%\" -Algorithm SHA256).Hash.ToLower();"^
       "If('%WRAPPER_SHA_256_SUM%' -ne $hash){"^
       "  Write-Output 'Error: Failed to validate Maven wrapper SHA-256, your Maven wrapper might be compromised.';"^
       "  Write-Output 'Investigate or delete %WRAPPER_JAR% to attempt a clean download.';"^
       "  Write-Output 'If you updated your Maven version, you need to update the specified wrapperSha256Sum property.';"^
       "  exit 1;"^
       "}"^
       "}"
    if ERRORLEVEL 1 goto error
)

@REM Provide a "standardized" way to retrieve the CLI args that will
@REM work with both Windows and non-Windows executions.
set MAVEN_CMD_LINE_ARGS=%*

%MAVEN_JAVA_EXE% ^
  %JVM_CONFIG_MAVEN_PROPS% ^
  %MAVEN_OPTS% ^
  %MAVEN_DEBUG_OPTS% ^
  -classpath %WRAPPER_JAR% ^
  "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" ^
  %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %*
if ERRORLEVEL 1 goto error
goto end

:error
set ERROR_CODE=1

:end
@endlocal & set ERROR_CODE=%ERROR_CODE%

if not "%MAVEN_SKIP_RC%"=="" goto skipRcPost
@REM check for post script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_post.bat" call "%USERPROFILE%\mavenrc_post.bat"
if exist "%USERPROFILE%\mavenrc_post.cmd" call "%USERPROFILE%\mavenrc_post.cmd"
:skipRcPost

@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
if "%MAVEN_BATCH_PAUSE%"=="on" pause

if "%MAVEN_TERMINATE_CMD%"=="on" exit %ERROR_CODE%

cmd /C exit /B %ERROR_CODE%
pom.xml
@@ -0,0 +1,69 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>de.hsma</groupId>
    <artifactId>spark-it</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <maven.compiler.target>11</maven.compiler.target>
        <maven.compiler.source>11</maven.compiler.source>
        <spark.version>3.5.1</spark.version>
        <cassandra.version>3.5.0</cassandra.version>
        <slf4j.version>1.7.36</slf4j.version>
        <jackson.version>2.15.2</jackson.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.13</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.13</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.13</artifactId>
            <version>${spark.version}</version>
            <!-- <scope>provided</scope> -->
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.13</artifactId>
            <version>${spark.version}</version>
            <!-- <scope>provided</scope> -->
        </dependency>

        <dependency>
            <groupId>com.datastax.spark</groupId>
            <artifactId>spark-cassandra-connector_2.12</artifactId>
            <version>${cassandra.version}</version>
        </dependency>

        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
    </dependencies>

</project>
BasicDataset.java
@@ -0,0 +1,25 @@
package de.hsma.informatik.bdea;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class BasicDataset {

    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .appName("DSExample").master("local[4]")
                .getOrCreate();

        List<Integer> data = Arrays.asList(2, 3, 4, 6, 7, 8);
        Dataset<Integer> ds = spark.createDataset(data, Encoders.INT());

        ds.createOrReplaceTempView("numbers");
        spark.sql("select * from numbers where value > 3").show();
    }

}
BasicSQL.java
@@ -0,0 +1,47 @@
package de.hsma.informatik.bdea;

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// https://github.com/apache/spark/blob/master/examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQLExample.java

public class BasicSQL {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("Spark SQL examples").master("local[*]").getOrCreate();

        Dataset<Row> data = spark.read().json("src/main/resources/data.json");

        data.show();
        data.printSchema();

        Dataset<Row> fnameDS = data.select("fname");
        fnameDS.show();

        Dataset<Row> nameDS = data.select(col("fname"), col("lname"));
        nameDS.show();

        Dataset<Row> genderDS = data.groupBy("gender").count();
        genderDS.show();

        Dataset<Row> maleGenderDS = data.filter(col("gender").eqNullSafe("Male"));
        maleGenderDS.show();

        Dataset<Person> peopleDS = data.as(Encoders.bean(Person.class));
        peopleDS.show();

        System.out.println("Select...");
        data.createOrReplaceTempView("People");
        Dataset<Row> sqlDF = spark.sql("SELECT * FROM People WHERE lname like 'W%'");
        sqlDF.show();

        spark.close();
        spark.stop();
    }

}
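Side note: Encoders.bean(Person.class) maps the JSON columns onto the JavaBean getters and setters of the Person class added further down in this commit, which is why Person implements Serializable and exposes a full getter/setter pair per field.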
FriscoFire.java
@@ -0,0 +1,117 @@
package de.hsma.informatik.bdea;

import static org.apache.spark.sql.functions.dayofyear;
import static org.apache.spark.sql.functions.month;
import static org.apache.spark.sql.functions.unix_timestamp;
import static org.apache.spark.sql.functions.year;

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// cf.: https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/8599738367597028/4332292154849829/3601578643761083/latest.html
// data from: https://data.sfgov.org/Public-Safety/Fire-Department-Calls-for-Service/nuek-vuh3
// https://www.youtube.com/watch?v=K14plpZgy_c
// Update 24.04.24: copy is here: https://github.com/NashTech-Labs/Sparkathon/blob/master/src/main/resources/assignment/Fire_Department_Calls_for_Service.csv
public class FriscoFire {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("Spark SQL examples").master("local[4]").getOrCreate();

        Dataset<Row> data = spark.read().format("csv").option("header", true).option("inferSchema", true)
                .load("/home/marcus/Downloads/Fire_Department_Calls_for_Service.csv");

        // let's see what we loaded
        data.printSchema();
        data.limit(5).show();

        // rename columns (remove spaces and hyphens for later SQL queries)
        for (String name : data.columns()) {
            data = data.withColumnRenamed(name, name.replaceAll(" ", ""));
        }
        data = data.withColumnRenamed("Neighborhooods-AnalysisBoundaries", "Neighborhoods");

        // print the column names
        Arrays.stream(data.columns()).forEach(System.out::println);

        // number of rows
        System.out.println(data.count());

        // types of emergencies, do not truncate output
        data.select("CallType").distinct().show(false);

        // how many calls of each type were there?
        data.select("CallType").groupBy("CallType").count().orderBy("count").show(false);

        // adjust data types: convert String to Timestamp
        String fp1 = "MM/dd/yyyy";
        String fp2 = "MM/dd/yyyy hh:mm:ss a";

        data = data.withColumn("CallDateTS", unix_timestamp(data.col("CallDate"), fp1).cast("timestamp")).drop("CallDate");
        data = data.withColumn("WatchDateTS", unix_timestamp(data.col("WatchDate"), fp1).cast("timestamp")).drop("WatchDate");

        data = data.withColumn("ReceivedDtTmTS", unix_timestamp(data.col("ReceivedDtTm"), fp2).cast("timestamp")).drop("ReceivedDtTm");
        data = data.withColumn("EntryDtTmTS", unix_timestamp(data.col("EntryDtTm"), fp2).cast("timestamp")).drop("EntryDtTm");
        data = data.withColumn("DispatchDtTmTS", unix_timestamp(data.col("DispatchDtTm"), fp2).cast("timestamp")).drop("DispatchDtTm");
        data = data.withColumn("ResponseDtTmTS", unix_timestamp(data.col("ResponseDtTm"), fp2).cast("timestamp")).drop("ResponseDtTm");
        data = data.withColumn("OnSceneDtTmTS", unix_timestamp(data.col("OnSceneDtTm"), fp2).cast("timestamp")).drop("OnSceneDtTm");
        data = data.withColumn("TransportDtTmTS", unix_timestamp(data.col("TransportDtTm"), fp2).cast("timestamp")).drop("TransportDtTm");
        data = data.withColumn("HospitalDtTmTS", unix_timestamp(data.col("HospitalDtTm"), fp2).cast("timestamp")).drop("HospitalDtTm");
        data = data.withColumn("AvailableDtTmTS", unix_timestamp(data.col("AvailableDtTm"), fp2).cast("timestamp")).drop("AvailableDtTm");

        // look at the new schema
        data.printSchema();
        data.limit(5).show();

        // which call types are there?
        data.select(data.col("CallType")).distinct().orderBy(year(data.col("CallDateTS"))).show(100, false);

        // which years of data do we have?
        data.select(year(data.col("CallDateTS"))).distinct().orderBy(data.col("CallDateTS")).show();

        // show the calls from 2018
        data.filter(year(data.col("CallDateTS")).equalTo("2018")).show();

        // how many calls were there in July 2017?
        long calls = data.filter(year(data.col("CallDateTS")).equalTo("2017"))
                .filter(month(data.col("CallDateTS")).equalTo("07")).count();
        System.out.println(calls);

        // how many calls in 7 days of 2016?
        data.filter(year(data.col("CallDateTS")).equalTo("2016"))
                .filter(dayofyear(data.col("CallDateTS")).$greater("179"))
                .groupBy(dayofyear(data.col("CallDateTS"))).count().orderBy(dayofyear(data.col("CallDateTS"))).show(7);

        data.repartition(4).createOrReplaceTempView("fireServiceVIEW");
        spark.sql("SELECT DISTINCT Neighborhoods FROM fireServiceVIEW LIMIT 15").show();

        // caching costs noticeably more time at first...
        // spark.catalog().cacheTable("fireServiceVIEW");
        // System.out.println("----> Count: " + spark.table("fireServiceVIEW").count());

        // with SQL: where did most calls come from in 2015?
        spark.sql("SELECT Neighborhoods, count(Neighborhoods) AS Neighborhood_Count FROM fireServiceVIEW "
                + "WHERE year(CallDateTS) == 2015 GROUP BY Neighborhoods ORDER BY Neighborhood_Count DESC LIMIT 15").show();

        // joining... second table: https://data.sfgov.org/Public-Safety/Fire-Incidents/wr8u-xric
        Dataset<Row> incidents = spark.read().format("csv").option("header", true).option("inferSchema", true)
                .load("/home/marcus/Downloads/Fire_Incidents.csv").withColumnRenamed("Incident Number", "IncidentNumber").cache();

        System.out.println(incidents.count());
        incidents.show(3);

        // here the join happens... can take a moment
        // see also: http://www.techburps.com/misc/apache-spark-dataset-joins-in-java/129
        System.out.println("Joining...");
        Dataset<Row> joined = data.join(incidents, data.col("IncidentNumber").equalTo(incidents.col("IncidentNumber")));
        // Dataset<Row> joined = data.join(incidents, data.col("IncidentNumber").equalTo(incidents.col("IncidentNumber")), "fullouter");
        joined.show(100);

        System.out.println("Fin.");
    }

}
LineCount.java
@@ -0,0 +1,23 @@
package de.hsma.informatik.bdea;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class LineCount {

    public static void main(String[] args) {
        String pfad = "src/main/resources/Faust.txt";

        SparkConf conf = new SparkConf()
                .setAppName("LineCount").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> text = sc.textFile(pfad);
        long cnt = text.filter(line -> line.contains("Liebe")).count();

        System.out.println(cnt + " Zeilen mit 'Liebe' in " + pfad);
        sc.close();
    }

}
Person.java
@@ -0,0 +1,67 @@
package de.hsma.informatik.bdea;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;

import java.io.Serializable;

@JsonIgnoreProperties
public class Person implements Serializable {

    @JsonProperty(value = "id")
    private Long id;

    @JsonProperty(value = "fname")
    private String fname;

    @JsonProperty(value = "lname")
    private String lname;

    @JsonProperty(value = "email")
    private String email;

    @JsonProperty(value = "gender")
    private String gender;

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getFname() {
        return fname;
    }

    public void setFname(String fname) {
        this.fname = fname;
    }

    public String getLname() {
        return lname;
    }

    public void setLname(String lname) {
        this.lname = lname;
    }

    public String getEmail() {
        return email;
    }

    public void setEmail(String email) {
        this.email = email;
    }

    public String getGender() {
        return gender;
    }

    public void setGender(String gender) {
        this.gender = gender;
    }

    //getters and setters
}
PiExample.java
@@ -0,0 +1,33 @@
package de.hsma.informatik.bdea;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class PiExample {
    private final static int NUM_SAMPLES = 10000000;

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("Getting-Started").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> l = new ArrayList<>(NUM_SAMPLES);
        for (int i = 0; i < NUM_SAMPLES; i++) {
            l.add(i);
        }

        long count = sc.parallelize(l).filter(i -> {
            double x = Math.random();
            double y = Math.random();
            return x*x + y*y <= 1;
        }).count();

        System.out.println("Pi is roughly " + 4.0 * count / NUM_SAMPLES);

        sc.close();
    }

}
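For reference: a point (x, y) drawn uniformly from the unit square lies inside the quarter circle x² + y² ≤ 1 with probability π/4, so the Monte Carlo estimate is π ≈ 4 · count / NUM_SAMPLES.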
SimpleAdder.java
@@ -0,0 +1,40 @@
package de.hsma.informatik.bdea;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SimpleAdder {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SimpleAdder").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        Integer[] ints = {1,2,4,5,6,7,8,9};
        JavaRDD<Integer> dist = sc.parallelize(Arrays.asList(ints));

        dist.collect().forEach(System.out::println);

        System.out.println(dist.reduce((x,y) -> x+y));

        Integer[] others = {2,3,1,0, 0, 3, 4};
        JavaRDD<Integer> dist2 = sc.parallelize(Arrays.asList(others));
        // JavaRDD<Integer> res = dist.intersection(dist2);
        // JavaPairRDD<Integer, Integer> res = dist.join(dist2);
        // res.collect().forEach(System.out::println);

        dist.intersection(dist2).collect().forEach(System.out::println);

        dist2 = dist2.sortBy(x->x, true, 1);

        dist2.sample(true, 0.5).collect().forEach(System.out::println);
        // dist2.take(2).forEach(System.out::println);
        dist2.collect().forEach(System.out::println);

    }

}
SparkStructuredStreaming.java
@@ -0,0 +1,55 @@
package de.hsma.informatik.bdea;

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import org.apache.spark.sql.streaming.Trigger;

import java.util.Arrays;
import java.util.concurrent.TimeoutException;

// from https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html
public class SparkStructuredStreaming {

    public static void main(String[] args) throws InterruptedException, TimeoutException, StreamingQueryException {
        SparkSession spark = SparkSession
                .builder()
                .appName("JavaStructuredNetworkWordCount")
                .master("local[*]")
                .getOrCreate();

        // the following needs a local server on port 9999,
        // e.g. nc -lk 9999

        // Create DataFrame representing the stream of input lines from connection to localhost:9999
        Dataset<Row> lines = spark
                .readStream()
                .format("socket")
                .option("host", "localhost")
                .option("port", 9999)
                .load();

        // Split the lines into words
        Dataset<String> words = lines
                .as(Encoders.STRING())
                .flatMap((FlatMapFunction<String, String>) x -> Arrays.asList(x.split(" ")).iterator(), Encoders.STRING());

        // Generate running word count
        Dataset<Row> wordCounts = words.groupBy("value").count();

        // Start running the query that prints the running counts to the console
        StreamingQuery query = wordCounts.writeStream()
                .outputMode("complete")
                .format("console")
                // https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers
                .trigger(Trigger.ProcessingTime("5 seconds")) // interval
                .start();

        query.awaitTermination();
    }

}
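A quick way to try this class (and the windowed variant below), assuming netcat is available; the port matches the hard-coded 9999 above, and exec:java carries the same assumption as in the VM-options note:

```shell
# Terminal 1: the text source the socket reader connects to
nc -lk 9999

# Terminal 2: run the example, then type words into terminal 1
# and watch the running counts appear on the console sink
./mvnw compile exec:java -Dexec.mainClass=de.hsma.informatik.bdea.SparkStructuredStreaming
```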
SparkStructuredStreamingWindowed.java
@@ -0,0 +1,74 @@
package de.hsma.informatik.bdea;

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

import scala.Tuple2;

import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeoutException;

// from https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html
public class SparkStructuredStreamingWindowed {

    public static void main(String[] args) throws InterruptedException, TimeoutException, StreamingQueryException {
        SparkSession spark = SparkSession
                .builder()
                .appName("JavaStructuredNetworkWordCount")
                .master("local[*]")
                .getOrCreate();

        int windowSize = 10;
        int slideSize = 5;

        // windows of 'windowSize' seconds
        String windowDuration = windowSize + " seconds";
        // refreshed every 'slideSize' seconds
        String slideDuration = slideSize + " seconds";

        // the following needs a local server on port 9999,
        // e.g. nc -lk 9999

        // Create DataFrame representing the stream of input lines from connection to localhost:9999
        Dataset<Row> lines = spark
                .readStream()
                .format("socket")
                .option("host", "localhost")
                .option("port", 9999)
                .option("includeTimestamp", true) // timestamps!
                .load();

        // Split the lines into words, retaining timestamps
        Dataset<Row> words = lines
                .as(Encoders.tuple(Encoders.STRING(), Encoders.TIMESTAMP()))
                .flatMap((FlatMapFunction<Tuple2<String, Timestamp>, Tuple2<String, Timestamp>>) t -> {
                    List<Tuple2<String, Timestamp>> result = new ArrayList<>();
                    for (String word : t._1.split(" ")) {
                        result.add(new Tuple2<>(word, t._2));
                    }
                    return result.iterator();
                },
                Encoders.tuple(Encoders.STRING(), Encoders.TIMESTAMP())
                ).toDF("word", "timestamp");

        // Group the data by window and word and compute the count of each group
        Dataset<Row> windowedCounts = words.groupBy(
                functions.window(words.col("timestamp"), windowDuration, slideDuration),
                words.col("word")
        ).count().orderBy("window");

        // Start running the query that prints the windowed word counts to the console
        StreamingQuery query = windowedCounts.writeStream()
                .outputMode("complete")
                .format("console")
                .option("truncate", "false")
                .start();

        query.awaitTermination();
    }

}
WordCount.java
@@ -0,0 +1,32 @@
package de.hsma.informatik.bdea;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class WordCount {

    public static void main(String[] args) {

        SparkConf conf = new SparkConf().setAppName("wc").setMaster("local[4]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> tokens = sc.textFile("src/main/resources/Faust.txt").flatMap(
                s -> Arrays.asList(s.split("\\W+")).iterator());

        JavaPairRDD<String, Integer> counts = tokens.mapToPair(
                token -> new Tuple2<>(token, 1)).reduceByKey((x,y) -> x+y);

        List<Tuple2<String, Integer>> results = counts.collect();
        results.forEach(System.out::println);

        sc.close();
    }

}
Two file diffs suppressed because they are too large.