Second attempt: Wordcloud
parent 08ced3e693
commit cd13864fb1
@@ -0,0 +1,166 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>wordcloud.informatik</groupId>
    <artifactId>wordcloud.informatik.maven.eclipse</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>5.2.5</version>
        </dependency>

        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>3.0.4</version>
        </dependency>

        <dependency>
            <groupId>org.apache.servicemix.bundles</groupId>
            <artifactId>org.apache.servicemix.bundles.lucene-analyzers-common</artifactId>
            <version>8.11.1_1</version>
        </dependency>

        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter</artifactId>
            <version>5.10.0</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <version>2.20.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.20.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>

            <!-- Compiler -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.13.0</version>
                <configuration>
                    <source>${maven.compiler.source}</source>
                    <target>${maven.compiler.target}</target>
                </configuration>
            </plugin>

            <!-- JAR creation (executable fat JAR via the Shade plugin) -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.6.0</version>
                <configuration>
                    <createDependencyReducedPom>false</createDependencyReducedPom>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer
                                    implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <!-- The manifest allows only one Main-Class entry;
                                         it must be the fully qualified class name. -->
                                    <mainClass>de.hs_mannheim.informatik.wordcloud.main.main</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>

            <!-- Code coverage, cf.: target/site/jacoco -->
            <plugin>
                <groupId>org.jacoco</groupId>
                <artifactId>jacoco-maven-plugin</artifactId>
                <version>0.8.12</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>prepare-agent</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>report</id>
                        <phase>test</phase>
                        <goals>
                            <goal>report</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <!-- Static code analysis, cf.: target/site/pmd.html -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-pmd-plugin</artifactId>
                <version>3.26.0</version>
                <configuration>
                    <failOnViolation>false</failOnViolation>
                    <printFailingErrors>true</printFailingErrors>
                </configuration>
                <executions>
                    <execution>
                        <phase>verify</phase>
                        <goals>
                            <goal>check</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

        </plugins>
    </build>

    <reporting>
        <plugins>
            <!-- Generate Javadocs via "mvn site"; output ends up in the site folder -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-javadoc-plugin</artifactId>
                <version>3.11.2</version>
                <configuration>
                    <show>private</show>
                    <nohelp>true</nohelp>
                </configuration>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-checkstyle-plugin</artifactId>
                <version>3.6.0</version>
            </plugin>

        </plugins>
    </reporting>

</project>
@@ -0,0 +1,228 @@
package de.hs_mannheim.informatik.wordcloud.main;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

/**
 * Reads .pdf, .docx, .pptx and .txt files and counts word frequencies
 * with a language-specific Lucene analyzer.
 */
public class Filereading {

    private Map<String, Integer> words = new TreeMap<>();
    private static final Logger logger = LogManager.getLogger(Filereading.class);

    Filereading(String path, Language language) {
        if (path.endsWith(".pdf")) {
            String pdfText = pdfReading(path);
            if (pdfText != null) {
                textAnalyzis(pdfText, language);
            }
        } else if (path.endsWith(".docx")) {
            String text = reading(path);
            if (text != null) {
                textAnalyzis(text, language);
            }
        } else if (path.endsWith(".pptx")) {
            String text = pptReading(path);
            if (text != null) {
                textAnalyzis(text, language);
            }
        } else if (path.endsWith(".txt")) {
            String text = txtReading(path);
            if (text != null) {
                textAnalyzis(text, language);
            }
        }
    }

    // Source 1
    public enum Language {
        ENGLISH {
            @Override
            public Analyzer getAnalyzer() {
                return new EnglishAnalyzer();
            }
        },
        GERMAN {
            @Override
            public Analyzer getAnalyzer() {
                return new GermanAnalyzer();
            }
        };

        public abstract Analyzer getAnalyzer();
    }

    public Map<String, Integer> getWords() {
        return words;
    }

    public String txtReading(String path) {
        try {
            // Read the whole file content into a single string
            String content = new String(Files.readAllBytes(Paths.get(path)));
            logger.info("Datei erfolgreich gelesen.");
            return content;
        } catch (IOException e) {
            logger.error("Fehler beim Lesen der Datei.", e);
            return null;
        }
    }

    public String reading(String path) {
        logger.info("Datei wird gelesen: " + path);
        File file = new File(path);
        StringBuilder sb = new StringBuilder();

        try (FileInputStream fis = new FileInputStream(file);
             XWPFDocument document = new XWPFDocument(fis)) {

            List<XWPFParagraph> paragraphs = document.getParagraphs();
            for (XWPFParagraph para : paragraphs) {
                sb.append(para.getText()).append("\n");
            }
            return sb.toString();

        } catch (IOException e) {
            logger.error("Fehler beim Öffnen der Word-Datei: " + file.getPath(), e);
            return null;
        }
    }

    public String pdfReading(String path) {
        logger.info("Datei wird gelesen: " + path);
        File file = new File(path);

        try (PDDocument document = Loader.loadPDF(file)) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            return pdfStripper.getText(document);
        } catch (Exception e) {
            logger.error("Fehler beim Öffnen der Datei.", e);
            return null;
        }
    }

    // Source 1
    public String pptReading(String path) {
        StringBuilder text = new StringBuilder();
        logger.info("Datei wird gelesen: " + path);

        try (FileInputStream fis = new FileInputStream(path);
             XMLSlideShow ppt = new XMLSlideShow(fis)) {

            for (XSLFSlide slide : ppt.getSlides()) {
                for (XSLFShape shape : slide.getShapes()) {
                    if (shape instanceof XSLFTextShape) {
                        XSLFTextShape textShape = (XSLFTextShape) shape;
                        text.append(textShape.getText()).append("\n");
                    }
                }
            }

        } catch (IOException e) {
            logger.error("Fehler beim Lesen der PPTX-Datei.", e);
        }

        return text.toString();
    }

    public Map<String, Integer> textAnalyzis(String text, Language language) {
        try (Analyzer analyzer = language.getAnalyzer();
             TokenStream tokenStream = analyzer.tokenStream(null, text)) {

            CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();

            while (tokenStream.incrementToken()) {
                String token = termAttribute.toString();

                // Skip tokens that contain digits, are shorter than three
                // characters, or start with a non-alphanumeric character.
                if (!token.matches(".*\\d.*") && token.length() > 2 && !token.matches("^[^a-zA-Z0-9].*")) {
                    words.put(token, words.getOrDefault(token, 0) + 1);
                }
            }
            tokenStream.end();
            logger.info("Es wurden " + words.size() + " Worte ausgelesen");

        } catch (IOException e) {
            logger.error("Fehler bei der Textanalyse.", e);
            return null;
        }
        return words;
    }

    public Map<String, Integer> getTopNWords(int n) {
        return words.entrySet()
                .stream()
                .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
                .limit(n)
                .collect(TreeMap::new,
                        (m, e) -> m.put(e.getKey(), e.getValue()),
                        TreeMap::putAll); // Source 2
    }

}
@@ -0,0 +1,156 @@
package de.hs_mannheim.informatik.wordcloud.main;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * Filters the counted words, writes them to a CSV file and inserts the
 * generated tag-cloud spans into site/wordcloud.html.
 */
public class InsertWordcloudElements {

    private static final Logger logger = LogManager.getLogger(InsertWordcloudElements.class);
    private String search;

    InsertWordcloudElements(Map<String, Integer> cloudwords, ArrayList<String> nonoWords, int neededFreq, String search) {
        this.search = search;

        if (!nonoWords.isEmpty()) {
            Map<String, Integer> filterMap = filter(cloudwords, nonoWords);
            enterWordcloudElements(filterMap, neededFreq);
        } else {
            enterWordcloudElements(cloudwords, neededFreq);
        }

        createCSVFile();
        writeCSVFile(cloudwords);
    }

    public Map<String, Integer> filter(Map<String, Integer> words, ArrayList<String> badWords) {
        words.keySet().removeIf(badWords::contains);
        return words;
    }

    public void createCSVFile() {
        try {
            File file = new File("src/test/resources/woerter.csv");
            if (file.createNewFile()) {
                logger.info("Die csv Datei wurde erstellt");
            } else {
                logger.info("csv Datei existiert bereits");
            }
        } catch (Exception e) {
            logger.error("Fehler beim Erstellen der csv Datei.", e);
        }
    }

    public void writeCSVFile(Map<String, Integer> words) {
        try (FileWriter write = new FileWriter("src/test/resources/woerter.csv")) {
            for (String word : words.keySet()) {
                write.write(word + ", " + words.get(word) + ",\n");
            }
            logger.info("csv Datei wurde erfolgreich beschrieben.");
        } catch (Exception e) {
            logger.error("Datei konnte nicht beschrieben werden.", e);
        }
    }

    public void enterWordcloudElements(Map<String, Integer> words, int neededFreq) {

        int minFreq = Collections.min(words.values());
        int maxFreq = Collections.max(words.values());

        String filepath = "site/wordcloud.html";

        try {
            List<String> lines = Files.readAllLines(Paths.get(filepath));
            List<String> updateLines = new ArrayList<>();

            boolean inOldSpanBlock = false;

            for (String line : lines) {

                // Skip a previously generated span block until its closing </div>.
                if (inOldSpanBlock) {
                    if (line.contains("</div>")) {
                        updateLines.add(line);
                        inOldSpanBlock = false;
                    }
                    continue;
                }

                updateLines.add(line);

                if (line.contains("<!-- TODO: Hier die generierten Tags einsetzen -->")) {
                    int idCounter = 0;

                    for (String key : words.keySet()) {
                        if (words.get(key) < neededFreq) {
                            continue;
                        }

                        String tagClass = getTagcloudClass(words.get(key), minFreq, maxFreq);

                        String word = "<span id=\""
                                + idCounter + "\" class=\"wrd "
                                + tagClass + "\"><a href=\"https://www." + search + ".com/search?q="
                                + key + "\">"
                                + key + "</a></span>";

                        updateLines.add(word);
                        idCounter++;
                    }

                    inOldSpanBlock = true;
                }
            }

            Files.write(Paths.get(filepath), updateLines);

        } catch (IOException e) {
            logger.error("Fehler beim Lesen oder Schreiben der Datei.", e);
        }
    }

    public String getTagcloudClass(int frequency, int minFreq, int maxFreq) {
        if (maxFreq == minFreq) {
            return "tagcloud5";
        }

        // Map the frequency linearly onto the classes tagcloud0 .. tagcloud10.
        int range = maxFreq - minFreq;
        int relativeValue = (int) Math.round(10.0 * (frequency - minFreq) / range);

        return "tagcloud" + relativeValue;
    }

}
@@ -0,0 +1,12 @@
package de.hs_mannheim.informatik.wordcloud.main;

import java.io.FileNotFoundException;

public class main {

    public static void main(String[] args) throws FileNotFoundException {
        new UserInterface();
    }

}
@@ -0,0 +1,205 @@
package de.hs_mannheim.informatik.wordcloud.test;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

/**
 * Reads .pdf, .docx and .pptx files and counts word frequencies
 * with a language-specific Lucene analyzer.
 */
public class Filereading {

    private Map<String, Integer> words = new TreeMap<>();
    private static final Logger logger = LogManager.getLogger(Filereading.class);

    Filereading(String path, Language language) {
        if (path.endsWith(".pdf")) {
            String pdfText = pdfReading(path);
            if (pdfText != null) {
                textAnalyzis(pdfText, language);
            }
        } else if (path.endsWith(".docx")) {
            String text = reading(path);
            if (text != null) {
                textAnalyzis(text, language);
            }
        } else if (path.endsWith(".pptx")) {
            String text = pptReading(path);
            if (text != null) {
                textAnalyzis(text, language);
            }
        }
    }

    public enum Language {
        ENGLISH {
            @Override
            public Analyzer getAnalyzer() {
                return new EnglishAnalyzer();
            }
        },
        GERMAN {
            @Override
            public Analyzer getAnalyzer() {
                return new GermanAnalyzer();
            }
        };

        public abstract Analyzer getAnalyzer();
    }

    public Map<String, Integer> getWords() {
        return words;
    }

    public String reading(String path) {
        logger.info("Datei wird gelesen: " + path);
        File file = new File(path);
        StringBuilder sb = new StringBuilder();

        try (FileInputStream fis = new FileInputStream(file);
             XWPFDocument document = new XWPFDocument(fis)) {

            List<XWPFParagraph> paragraphs = document.getParagraphs();
            for (XWPFParagraph para : paragraphs) {
                sb.append(para.getText()).append("\n");
            }
            return sb.toString();

        } catch (IOException e) {
            logger.error("Fehler beim Öffnen der Word-Datei: " + file.getPath(), e);
            return null;
        }
    }

    public String pdfReading(String path) {
        logger.info("Datei wird gelesen: " + path);
        File file = new File(path);

        try (PDDocument document = Loader.loadPDF(file)) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            return pdfStripper.getText(document);
        } catch (Exception e) {
            logger.error("Fehler beim Öffnen der Datei.", e);
            return null;
        }
    }

    public String pptReading(String path) {
        StringBuilder text = new StringBuilder();
        logger.info("Datei wird gelesen: " + path);

        try (FileInputStream fis = new FileInputStream(path);
             XMLSlideShow ppt = new XMLSlideShow(fis)) {

            for (XSLFSlide slide : ppt.getSlides()) {
                for (XSLFShape shape : slide.getShapes()) {
                    if (shape instanceof XSLFTextShape) {
                        XSLFTextShape textShape = (XSLFTextShape) shape;
                        text.append(textShape.getText()).append("\n");
                    }
                }
            }

        } catch (IOException e) {
            logger.error("Fehler beim Lesen der PPTX-Datei.", e);
        }

        return text.toString();
    }

    public Map<String, Integer> textAnalyzis(String text, Language language) {
        try (Analyzer analyzer = language.getAnalyzer();
             TokenStream tokenStream = analyzer.tokenStream(null, text)) {

            CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();

            while (tokenStream.incrementToken()) {
                String token = termAttribute.toString();

                // Skip tokens that contain digits, are shorter than three
                // characters, or start with a non-alphanumeric character.
                if (!token.matches(".*\\d.*") && token.length() > 2 && !token.matches("^[^a-zA-Z0-9].*")) {
                    words.put(token, words.getOrDefault(token, 0) + 1);
                }
            }
            tokenStream.end();
            logger.info("Es wurden " + words.size() + " Worte ausgelesen");

        } catch (IOException e) {
            logger.error("Fehler bei der Textanalyse.", e);
            return null;
        }
        return words;
    }

    public Map<String, Integer> getTopNWords(int n) {
        return words.entrySet()
                .stream()
                .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
                .limit(n)
                .collect(TreeMap::new,
                        (m, e) -> m.put(e.getKey(), e.getValue()),
                        TreeMap::putAll);
    }

}
@@ -0,0 +1,35 @@
package de.hs_mannheim.informatik.wordcloud.test;

import static org.junit.jupiter.api.Assertions.*;

import java.util.Map;

import org.junit.jupiter.api.Test;

import de.hs_mannheim.informatik.wordcloud.test.Filereading.Language;

class FileReadingtest {

    @Test
    public void testAnalyzeText() {

        Language deLang = Language.GERMAN;
        Language enLang = Language.ENGLISH;

        Filereading fileReading = new Filereading("src/test/resources/testCfile.pdf", deLang);
        Filereading docReading = new Filereading("src/test/resources/test.docx", deLang);
        Filereading pptxReading = new Filereading("src/test/resources/samplepptx.pptx", enLang);

        Map<String, Integer> words = fileReading.getWords();
        Map<String, Integer> docwords = docReading.getWords();
        Map<String, Integer> pptxwords = pptxReading.getWords();

        assertEquals(4, words.get("welt"));
        assertEquals(4, docwords.get("hallo"));
        assertEquals(2, pptxwords.get("handout"));
    }

}
@@ -0,0 +1,156 @@
package de.hs_mannheim.informatik.wordcloud.test;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * Filters the counted words, writes them to a CSV file and inserts the
 * generated tag-cloud spans into site/wordcloud.html.
 */
public class InsertWordcloudElements {

    private static final Logger logger = LogManager.getLogger(InsertWordcloudElements.class);
    private String search;

    InsertWordcloudElements(Map<String, Integer> cloudwords, ArrayList<String> nonoWords, int neededFreq, String search) {
        this.search = search;

        if (!nonoWords.isEmpty()) {
            Map<String, Integer> filterMap = filter(cloudwords, nonoWords);
            enterWordcloudElements(filterMap, neededFreq);
        } else {
            enterWordcloudElements(cloudwords, neededFreq);
        }

        createCSVFile();
        writeCSVFile(cloudwords);
    }

    public Map<String, Integer> filter(Map<String, Integer> words, ArrayList<String> badWords) {
        words.keySet().removeIf(badWords::contains);
        return words;
    }

    public void createCSVFile() {
        try {
            File file = new File("src/test/resources/woerter.csv");
            if (file.createNewFile()) {
                logger.info("Die csv Datei wurde erstellt");
            } else {
                logger.info("csv Datei existiert bereits");
            }
        } catch (Exception e) {
            logger.error("Fehler beim Erstellen der csv Datei.", e);
        }
    }

    public void writeCSVFile(Map<String, Integer> words) {
        try (FileWriter write = new FileWriter("src/test/resources/woerter.csv")) {
            for (String word : words.keySet()) {
                write.write(word + ", " + words.get(word) + ",\n");
            }
            logger.info("csv Datei wurde erfolgreich beschrieben.");
        } catch (Exception e) {
            logger.error("Datei konnte nicht beschrieben werden.", e);
        }
    }

    public void enterWordcloudElements(Map<String, Integer> words, int neededFreq) {

        int minFreq = Collections.min(words.values());
        int maxFreq = Collections.max(words.values());

        String filepath = "site/wordcloud.html";

        try {
            List<String> lines = Files.readAllLines(Paths.get(filepath));
            List<String> updateLines = new ArrayList<>();

            boolean inOldSpanBlock = false;

            for (String line : lines) {

                // Skip a previously generated span block until its closing </div>.
                if (inOldSpanBlock) {
                    if (line.contains("</div>")) {
                        updateLines.add(line);
                        inOldSpanBlock = false;
                    }
                    continue;
                }

                updateLines.add(line);

                if (line.contains("<!-- TODO: Hier die generierten Tags einsetzen -->")) {
                    int idCounter = 0;

                    for (String key : words.keySet()) {
                        if (words.get(key) < neededFreq) {
                            continue;
                        }

                        String tagClass = getTagcloudClass(words.get(key), minFreq, maxFreq);

                        String word = "<span id=\""
                                + idCounter + "\" class=\"wrd "
                                + tagClass + "\"><a href=\"https://www." + search + ".com/search?q="
                                + key + "\">"
                                + key + "</a></span>";

                        updateLines.add(word);
                        idCounter++;
                    }

                    inOldSpanBlock = true;
                }
            }

            Files.write(Paths.get(filepath), updateLines);

        } catch (IOException e) {
            logger.error("Fehler beim Lesen oder Schreiben der Datei.", e);
        }
    }

    public String getTagcloudClass(int frequency, int minFreq, int maxFreq) {
        if (maxFreq == minFreq) {
            return "tagcloud5";
        }

        // Map the frequency linearly onto the classes tagcloud0 .. tagcloud10.
        int range = maxFreq - minFreq;
        int relativeValue = (int) Math.round(10.0 * (frequency - minFreq) / range);

        return "tagcloud" + relativeValue;
    }

}
@@ -0,0 +1,12 @@
package de.hs_mannheim.informatik.wordcloud.test;

import java.io.FileNotFoundException;

public class test {

    public static void main(String[] args) throws FileNotFoundException {
        new UserInterface();
    }

}