Second attempt at the Wordcloud

main
thomasmuller 2025-05-20 12:09:33 +02:00
parent 08ced3e693
commit cd13864fb1
8 changed files with 970 additions and 0 deletions

pom.xml

@@ -0,0 +1,166 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>wordcloud.informatik</groupId>
<artifactId>wordcloud.informatik.maven.eclipse</artifactId>
<version>0.0.1-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.5</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.4</version>
</dependency>
<dependency>
<groupId>org.apache.servicemix.bundles</groupId>
<artifactId>org.apache.servicemix.bundles.lucene-analyzers-common</artifactId>
<version>8.11.1_1</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.10.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.20.0</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.20.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compiler -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.13.0</version>
<configuration>
<source>${maven.compiler.source}</source>
<target>${maven.compiler.target}</target>
</configuration>
</plugin>
<!-- JAR creation -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<createDependencyReducedPom>false</createDependencyReducedPom>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<!-- only one Main-Class entry is allowed in the manifest -->
<mainClass>de.hs_mannheim.informatik.wordcloud.main.main</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<!-- Code coverage, cf.: target/site/jacoco -->
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.12</version>
<executions>
<execution>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>report</id>
<phase>test</phase>
<goals>
<goal>report</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Static code analysis, cf.: target/site/pmd.html -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
<version>3.26.0</version>
<configuration>
<failOnViolation>false</failOnViolation>
<printFailingErrors>true</printFailingErrors>
</configuration>
<executions>
<execution>
<phase>verify</phase>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<!-- generate Javadocs via "mvn site" and find them in the target/site folder -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.11.2</version>
<configuration>
<show>private</show>
<nohelp>true</nohelp>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
<version>3.6.0</version>
</plugin>
</plugins>
</reporting>
</project>

Filereading.java (de.hs_mannheim.informatik.wordcloud.main)

@@ -0,0 +1,228 @@
package de.hs_mannheim.informatik.wordcloud.main;
import java.io.File;
import java.io.IOException;
import java.io.FileInputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
public class Filereading {
private Map<String, Integer> words = new TreeMap<>();
private static final Logger logger = LogManager.getLogger(Filereading.class);
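// Selects the matching reader for the file extension (.pdf, .docx, .pptx, .txt) and fills the word-frequency map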
Filereading(String path, Language language){
if(path.endsWith(".pdf")) {
String pdfText = pdfReading(path);
if(pdfText != null) {
textAnalyzis(pdfText, language);
}
}else if(path.endsWith(".docx")) {
String text = reading(path);
if(text != null) {
textAnalyzis(text, language);
}
}else if(path.endsWith(".pptx")) {
String text = pptReading(path);
if(text != null) {
textAnalyzis(text, language);
}
}else if(path.endsWith(".txt")) {
String text =txtReading(path);
if(text != null) {
textAnalyzis(text, language);
}
}
}
// Source 1: each language provides its own Lucene Analyzer (stop-word removal and stemming)
public enum Language {
ENGLISH {
@Override
public Analyzer getAnalyzer() {
return new EnglishAnalyzer();
}
},
GERMAN {
@Override
public Analyzer getAnalyzer() {
return new GermanAnalyzer();
}
};
public abstract Analyzer getAnalyzer();
}
public Map<String, Integer> getWords() {
return words;
}
public String txtReading(String path) {
try {
// Read the entire content of the file into a single string
String content = new String(Files.readAllBytes(Paths.get(path)));
logger.info("Datei erfolgreich gelesen.");
return content;
} catch (IOException e) {
logger.error("Fehler beim Lesen der Datei.", e);
return null;
}
}
public String reading(String path) {
String filePath = path;
logger.info("Datei wird gelesen: " + filePath);
File file = new File(filePath);
StringBuilder sb = new StringBuilder();
try (FileInputStream fis = new FileInputStream(file);
XWPFDocument document = new XWPFDocument(fis)) {
List<XWPFParagraph> paragraphs = document.getParagraphs();
for (XWPFParagraph para : paragraphs) {
sb.append(para.getText()).append("\n");
}
return sb.toString();
} catch (IOException e) {
logger.error("Fehler beim Öffnen der Word-Datei: " + file.getPath(), e);
return null;
}
}
public String pdfReading(String path) {
logger.info("Datei wird gelesen: " + path);
File file = new File(path);
try (PDDocument document = Loader.loadPDF(file)) {
PDFTextStripper pdfStripper = new PDFTextStripper();
return pdfStripper.getText(document);
} catch (Exception e) {
logger.error("Fehler beim Öffnen der Datei.", e);
return null;
}
}
// Source 1
public String pptReading(String path) {
StringBuilder text = new StringBuilder();
logger.info("Datei wird gelesen: " + path);
try (FileInputStream fis = new FileInputStream(path);
XMLSlideShow ppt = new XMLSlideShow(fis)) {
for (XSLFSlide slide : ppt.getSlides()) {
for (XSLFShape shape : slide.getShapes()) {
if (shape instanceof XSLFTextShape) {
XSLFTextShape textShape = (XSLFTextShape) shape;
text.append(textShape.getText()).append("\n");
}
}
}
} catch (IOException e) {
logger.error("Fehler beim Lesen der PPTX-Datei.", e);
}
return text.toString();
}
public Map<String, Integer> textAnalyzis(String text, Language language) {
try (Analyzer analyzer = language.getAnalyzer();
TokenStream tokenStream = analyzer.tokenStream(null, text)) {
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
String token = termAttribute.toString();
// keep tokens without digits that are longer than two characters and start with a letter or digit
if (!token.matches(".*\\d.*") && token.length() > 2 && !token.matches("^[^a-zA-Z0-9].*")) {
words.merge(token, 1, Integer::sum);
}
}
tokenStream.end();
logger.info("Es wurden " + words.size() + " Worte ausgelesen");
} catch (IOException e) {
logger.error("Fehler bei der Textanalyse.", e);
return null;
}
return words;
}
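// Returns the n most frequent words; note that collecting into a TreeMap orders the result alphabetically by key, not by frequency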
public Map<String, Integer> getTopNWords(int n) {
return words.entrySet()
.stream()
.sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
.limit(n)
.collect(TreeMap::new,
(m, e) -> m.put(e.getKey(), e.getValue()),
TreeMap::putAll); // Source 2
}
}

InsertWordcloudElements.java (de.hs_mannheim.informatik.wordcloud.main)

@@ -0,0 +1,156 @@
package de.hs_mannheim.informatik.wordcloud.main;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
public class InsertWordcloudElements {
private static final Logger logger = LogManager.getLogger(InsertWordcloudElements.class);
private String search;
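// Applies the blacklist (if any), inserts the remaining words into the HTML word cloud and writes all word counts to a CSV file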
InsertWordcloudElements(Map<String, Integer> cloudwords, ArrayList<String> nonoWords, int neededFreq, String search) {
this.search = search;
if(!nonoWords.isEmpty()) {
Map<String, Integer> filterMap = filter(cloudwords, nonoWords);
enterWordcloudElements(filterMap, neededFreq);
}else {
enterWordcloudElements(cloudwords, neededFreq);
}
createCSVFile();
writeCSVFile(cloudwords);
}
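// Removes every blacklisted word; the map passed in is modified in place and returned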
public Map<String,Integer> filter(Map<String, Integer> words,ArrayList<String> badWords) {
words.keySet().removeIf(badWords::contains);
return words;
}
public void createCSVFile() {
try {
File file = new File("src/test/resources/woerter.csv");
if (file.createNewFile()) {
logger.info("Die csv Datei wurde erstellt.");
} else {
logger.info("Die csv Datei existiert bereits.");
}
} catch (Exception e) {
logger.error("Fehler beim Erstellen der csv Datei.", e);
}
}
public void writeCSVFile(Map<String, Integer> words) {
try (FileWriter write = new FileWriter("src/test/resources/woerter.csv")) {
for (String word : words.keySet()) {
write.write(word + ", " + words.get(word) + ",\n");
}
logger.info("Die csv Datei wurde erfolgreich überschrieben.");
} catch (Exception e) {
logger.error("Datei konnte nicht beschrieben werden.", e);
}
}
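// Rewrites site/wordcloud.html: inserts one <span> per word with at least neededFreq occurrences after the TODO marker line and drops previously generated spans up to the closing </div>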
public void enterWordcloudElements(Map<String, Integer> words, int neededFreq) {
int minFreq = Collections.min(words.values());
int maxFreq = Collections.max(words.values());
String filepath = "site/wordcloud.html";
try {
List<String> lines = Files.readAllLines(Paths.get(filepath));
List<String> updateLines = new ArrayList<>();
boolean inOldSpanBlock = false;
for (String line : lines) {
if (inOldSpanBlock) {
if (line.contains("</div>")) {
updateLines.add(line);
inOldSpanBlock = false;
}
continue;
}
updateLines.add(line);
if (line.contains("<!-- TODO: Hier die generierten Tags einsetzen -->")) {
int idCounter = 0;
for (String key : words.keySet()) {
if (words.get(key) < neededFreq) {
continue;
}
String tagClass = getTagcloudClass(words.get(key), minFreq, maxFreq);
String word = "<span id=\""
+ idCounter + "\" class=\"wrd "
+ tagClass + "\"><a href=\"https://www."+search+".com/search?q="
+ key + "\">"
+ key + "</a></span>";
updateLines.add(word);
idCounter++;
}
inOldSpanBlock = true;
}
}
Files.write(Paths.get(filepath), updateLines);
} catch (IOException e) {
logger.error("Fehler beim Lesen oder Schreiben der Datei.", e);
}
}
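// Scales a frequency linearly to one of the tag classes tagcloud0 .. tagcloud10; if all words are equally frequent, the middle class tagcloud5 is used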
public String getTagcloudClass(int frequency, int minFreq, int maxFreq) {
if (maxFreq == minFreq) {
return "tagcloud5";
}
int range = maxFreq - minFreq;
int relativeValue = (int) Math.round(10.0 * (frequency - minFreq) / range);
return "tagcloud" + relativeValue;
}
}

main.java (de.hs_mannheim.informatik.wordcloud.main)

@@ -0,0 +1,12 @@
package de.hs_mannheim.informatik.wordcloud.main;
import java.io.FileNotFoundException;
public class main {
public static void main(String[] args) throws FileNotFoundException{
new UserInterface();
}
}

Filereading.java (de.hs_mannheim.informatik.wordcloud.test)

@@ -0,0 +1,205 @@
package de.hs_mannheim.informatik.wordcloud.test;
import java.io.File;
import java.io.IOException;
import java.io.FileInputStream;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
public class Filereading {
private Map<String, Integer> words = new TreeMap<>();
private static final Logger logger = LogManager.getLogger(Filereading.class);
Filereading(String path, Language language){
if(path.endsWith(".pdf")) {
String pdfText = pdfReading(path);
if(pdfText != null) {
textAnalyzis(pdfText, language);
}
}else if(path.endsWith(".docx")) {
String text = reading(path);
if(text != null) {
textAnalyzis(text, language);
}
}else if(path.endsWith(".pptx")) {
String text = pptReading(path);
if(text != null) {
textAnalyzis(text, language);
}
}
}
public enum Language {
ENGLISH {
@Override
public Analyzer getAnalyzer() {
return new EnglishAnalyzer();
}
},
GERMAN {
@Override
public Analyzer getAnalyzer() {
return new GermanAnalyzer();
}
};
public abstract Analyzer getAnalyzer();
}
public Map<String, Integer> getWords() {
return words;
}
public String reading(String path) {
String filePath = path;
logger.info("Datei wird gelesen: " + filePath);
File file = new File(filePath);
StringBuilder sb = new StringBuilder();
try (FileInputStream fis = new FileInputStream(file);
XWPFDocument document = new XWPFDocument(fis)) {
List<XWPFParagraph> paragraphs = document.getParagraphs();
for (XWPFParagraph para : paragraphs) {
sb.append(para.getText()).append("\n");
}
return sb.toString();
} catch (IOException e) {
logger.error("Fehler beim Öffnen der Word-Datei: " + file.getPath(), e);
return null;
}
}
public String pdfReading(String path) {
logger.info("Datei wird gelesen: " + path);
File file = new File(path);
try (PDDocument document = Loader.loadPDF(file)) {
PDFTextStripper pdfStripper = new PDFTextStripper();
return pdfStripper.getText(document);
} catch (Exception e) {
logger.error("Fehler beim Öffnen der Datei.", e);
return null;
}
}
public String pptReading(String path) {
StringBuilder text = new StringBuilder();
logger.info("Datei wird gelesen: " + path);
try (FileInputStream fis = new FileInputStream(path);
XMLSlideShow ppt = new XMLSlideShow(fis)) {
for (XSLFSlide slide : ppt.getSlides()) {
for (XSLFShape shape : slide.getShapes()) {
if (shape instanceof XSLFTextShape) {
XSLFTextShape textShape = (XSLFTextShape) shape;
text.append(textShape.getText()).append("\n");
}
}
}
} catch (IOException e) {
logger.error("Fehler beim Lesen der PPTX-Datei.", e);
}
return text.toString();
}
public Map<String, Integer> textAnalyzis(String text, Language language) {
try (Analyzer analyzer = language.getAnalyzer();
TokenStream tokenStream = analyzer.tokenStream(null, text)) {
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
String token = termAttribute.toString();
// keep tokens without digits that are longer than two characters and start with a letter or digit
if (!token.matches(".*\\d.*") && token.length() > 2 && !token.matches("^[^a-zA-Z0-9].*")) {
words.merge(token, 1, Integer::sum);
}
}
tokenStream.end();
logger.info("Es wurden " + words.size() + " Worte ausgelesen");
} catch (IOException e) {
logger.error("Fehler bei der Textanalyse.", e);
return null;
}
return words;
}
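// Returns the n most frequent words; note that collecting into a TreeMap orders the result alphabetically by key, not by frequency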
public Map<String, Integer> getTopNWords(int n) {
return words.entrySet()
.stream()
.sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
.limit(n)
.collect(TreeMap::new,
(m, e) -> m.put(e.getKey(), e.getValue()),
TreeMap::putAll);
}
}

FileReadingtest.java (de.hs_mannheim.informatik.wordcloud.test)

@@ -0,0 +1,35 @@
package de.hs_mannheim.informatik.wordcloud.test;
import static org.junit.jupiter.api.Assertions.*;
import java.util.Map;
import org.junit.jupiter.api.Test;
import de.hs_mannheim.informatik.wordcloud.test.Filereading.Language;
class FileReadingtest {
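// Reads one PDF, one DOCX and one PPTX sample file and checks the frequency of selected words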
@Test
public void testAnalyzeText() {
Language deLang = Language.GERMAN;
Language enLang = Language.ENGLISH;
Filereading fileReading = new Filereading("src/test/resources/testCfile.pdf",deLang);
Filereading docReading = new Filereading("src/test/resources/test.docx", deLang);
Filereading pptxReading = new Filereading("src/test/resources/samplepptx.pptx", enLang);
Map<String, Integer> words = fileReading.getWords();
Map<String, Integer> docwords = docReading.getWords();
Map<String, Integer> pptxwords = pptxReading.getWords();
assertEquals(4, words.get("welt"));
assertEquals(4, docwords.get("hallo"));
assertEquals(2, pptxwords.get("handout"));
}
}

InsertWordcloudElements.java (de.hs_mannheim.informatik.wordcloud.test)

@@ -0,0 +1,156 @@
package de.hs_mannheim.informatik.wordcloud.test;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
public class InsertWordcloudElements {
private static final Logger logger = LogManager.getLogger(InsertWordcloudElements.class);
private String search;
InsertWordcloudElements(Map<String, Integer> cloudwords, ArrayList<String> nonoWords, int neededFreq, String search) {
this.search = search;
if(!nonoWords.isEmpty()) {
Map<String, Integer> filterMap = filter(cloudwords, nonoWords);
enterWordcloudElements(filterMap, neededFreq);
}else {
enterWordcloudElements(cloudwords, neededFreq);
}
createCSVFile();
writeCSVFile(cloudwords);
}
public Map<String,Integer> filter(Map<String, Integer> words,ArrayList<String> badWords) {
words.keySet().removeIf(badWords::contains);
return words;
}
public void createCSVFile() {
try {
File file = new File("src/test/resources/woerter.csv");
if (file.createNewFile()) {
logger.info("Die csv Datei wurde erstellt.");
} else {
logger.info("Die csv Datei existiert bereits.");
}
} catch (Exception e) {
logger.error("Fehler beim Erstellen der csv Datei.", e);
}
}
public void writeCSVFile(Map<String, Integer> words) {
try (FileWriter write = new FileWriter("src/test/resources/woerter.csv")) {
for (String word : words.keySet()) {
write.write(word + ", " + words.get(word) + ",\n");
}
logger.info("Die csv Datei wurde erfolgreich überschrieben.");
} catch (Exception e) {
logger.error("Datei konnte nicht beschrieben werden.", e);
}
}
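// Rewrites site/wordcloud.html: inserts one <span> per word with at least neededFreq occurrences after the TODO marker line and drops previously generated spans up to the closing </div>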
public void enterWordcloudElements(Map<String, Integer> words, int neededFreq) {
int minFreq = Collections.min(words.values());
int maxFreq = Collections.max(words.values());
String filepath = "site/wordcloud.html";
try {
List<String> lines = Files.readAllLines(Paths.get(filepath));
List<String> updateLines = new ArrayList<>();
boolean inOldSpanBlock = false;
for (String line : lines) {
if (inOldSpanBlock) {
if (line.contains("</div>")) {
updateLines.add(line);
inOldSpanBlock = false;
}
continue;
}
updateLines.add(line);
if (line.contains("<!-- TODO: Hier die generierten Tags einsetzen -->")) {
int idCounter = 0;
for (String key : words.keySet()) {
if (words.get(key) < neededFreq) {
continue;
}
String tagClass = getTagcloudClass(words.get(key), minFreq, maxFreq);
String word = "<span id=\""
+ idCounter + "\" class=\"wrd "
+ tagClass + "\"><a href=\"https://www."+search+".com/search?q="
+ key + "\">"
+ key + "</a></span>";
updateLines.add(word);
idCounter++;
}
inOldSpanBlock = true;
}
}
Files.write(Paths.get(filepath), updateLines);
} catch (IOException e) {
logger.error("Fehler beim Lesen oder Schreiben der Datei.", e);
}
}
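// Scales a frequency linearly to one of the tag classes tagcloud0 .. tagcloud10; if all words are equally frequent, the middle class tagcloud5 is used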
public String getTagcloudClass(int frequency, int minFreq, int maxFreq) {
if (maxFreq == minFreq) {
return "tagcloud5";
}
int range = maxFreq - minFreq;
int relativeValue = (int) Math.round(10.0 * (frequency - minFreq) / range);
return "tagcloud" + relativeValue;
}
}

test.java (de.hs_mannheim.informatik.wordcloud.test)

@@ -0,0 +1,12 @@
package de.hs_mannheim.informatik.wordcloud.test;
import java.io.FileNotFoundException;
public class test {
public static void main(String[] args) throws FileNotFoundException{
new UserInterface();
}
}