diff --git a/de.hs-mannheim.informatik.wordcloud/pom.xml b/de.hs-mannheim.informatik.wordcloud/pom.xml new file mode 100644 index 0000000..546e71f --- /dev/null +++ b/de.hs-mannheim.informatik.wordcloud/pom.xml @@ -0,0 +1,166 @@ + + 4.0.0 + worldcloud.informatik + wordcloud.informatik.maven.eclipse + 0.0.1-SNAPSHOT + + + UTF-8 + 1.8 + 1.8 + + + + + org.apache.poi + poi-ooxml + 5.2.5 + + + + + org.apache.pdfbox + pdfbox + 3.0.4 + + + + org.apache.servicemix.bundles + org.apache.servicemix.bundles.lucene-analyzers-common + 8.11.1_1 + + + + + org.junit.jupiter + junit-jupiter + 5.10.0 + test + + + + org.apache.logging.log4j + log4j-api + 2.20.0 + + + + org.apache.logging.log4j + log4j-core + 2.20.0 + + + + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + ${maven.compiler.source} + ${maven.compiler.target} + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.6.0 + + false + + + + package + + shade + + + + + de.hs_mannheim.informatik.wordcloud.main + de.hs_mannheim.informatik.wordcloud.test + + + + + + + + + + org.jacoco + jacoco-maven-plugin + 0.8.12 + + + + prepare-agent + + + + report + test + + report + + + + + + + + org.apache.maven.plugins + maven-pmd-plugin + 3.26.0 + + false + true + + + + verify + + check + + + + + + + + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.11.2 + + private + true + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + 3.6.0 + + + + + + + \ No newline at end of file diff --git a/de.hs-mannheim.informatik.wordcloud/src/main/java/de/hs_mannheim/informatik/wordcloud/main/Filereading.java b/de.hs-mannheim.informatik.wordcloud/src/main/java/de/hs_mannheim/informatik/wordcloud/main/Filereading.java new file mode 100644 index 0000000..337bf42 --- /dev/null +++ b/de.hs-mannheim.informatik.wordcloud/src/main/java/de/hs_mannheim/informatik/wordcloud/main/Filereading.java @@ -0,0 +1,228 @@ +package de.hs_mannheim.informatik.wordcloud.main; + + +import java.io.File; +import 
java.io.FileReader; +import java.io.IOException; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.Map; + +import java.util.TreeMap; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.de.GermanAnalyzer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFShape; +import org.apache.poi.xslf.usermodel.XSLFSlide; +import org.apache.poi.xslf.usermodel.XSLFTextShape; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; + + + +public class Filereading { + + + private Map words = new TreeMap<>(); + private static final Logger logger = LogManager.getLogger(Filereading.class); + + + Filereading(String path, Language language){ + + if(path.endsWith(".pdf")) { + String pdfText = pdfReading(path); + if(pdfText != null) { + textAnalyzis(pdfText, language); + } + }else if(path.endsWith(".docx")) { + String text = reading(path); + if(text != null) { + textAnalyzis(text, language); + } + }else if(path.endsWith(".pptx")) { + String text = pptReading(path); + if(text != null) { + textAnalyzis(text, language); + } + }else if(path.endsWith(".txt")) { + String text =txtReading(path); + if(text != null) { + textAnalyzis(text, language); + } + } + + + + } + + //1.Quelle + public enum Language { + ENGLISH { + @Override + public Analyzer getAnalyzer() { + return new EnglishAnalyzer(); + } + }, + GERMAN { + @Override + public Analyzer getAnalyzer() { + return new GermanAnalyzer(); + } + }; + + public abstract Analyzer 
getAnalyzer(); + } + + + public Map getWords() { + return words; + } + + + public String txtReading(String path) { + String filePath = path; + try { + // Lies den gesamten Inhalt der Datei in einen String + String content = new String(Files.readAllBytes(Paths.get(path))); + logger.info("Datei erfolgreich gelesen."); + return content; + } catch (IOException e) { + logger.error("Fehler beim Lesen der Datei.", e); + return null; + } + + } + + + + + public String reading(String path) { + + String filePath = path; + logger.info("Datei wird gelesen: " + filePath); + File file = new File(filePath); + + + StringBuilder sb = new StringBuilder(); + + try (FileInputStream fis = new FileInputStream(file); + XWPFDocument document = new XWPFDocument(fis)) { + + List paragraphs = document.getParagraphs(); + for (XWPFParagraph para : paragraphs) { + sb.append(para.getText()).append("\n"); + + } + + return sb.toString(); + + } catch (IOException e) { + logger.error("Fehler beim Öffnen der Word-Datei: " + file.getPath(), e); + return null; + } + + + } + + + public String pdfReading(String path) { + + String filePath = path; + logger.info("Datei wird gelesen: " + filePath); + File file = new File(filePath); + + String text = " "; + + try(PDDocument document = Loader.loadPDF(file)){ + PDFTextStripper pdfStripper = new PDFTextStripper(); + text = pdfStripper.getText(document); + return text; + }catch(Exception e) { + logger.error("Fehler beim öffnen der Datei.", e); + e.printStackTrace(); + return null; + } + } + + //1. 
Quelle + public String pptReading(String path) { + StringBuilder text = new StringBuilder(); + logger.info("Datei wird gelesen: " + path); + try (FileInputStream fis = new FileInputStream(path); + XMLSlideShow ppt = new XMLSlideShow(fis)) { + + for (XSLFSlide slide : ppt.getSlides()) { + for (XSLFShape shape : slide.getShapes()) { + if (shape instanceof XSLFTextShape) { + XSLFTextShape textShape = (XSLFTextShape) shape; + text.append(textShape.getText()).append("\n"); + } + } + } + + } catch (IOException e) { + System.err.println("Fehler beim Lesen der PPTX-Datei:"); + e.printStackTrace(); + } + + return text.toString(); + } + + + + public Map textAnalyzis(String text, Language language) { + + Map textmap = new TreeMap<>(); + try (Analyzer analyzer = language.getAnalyzer()) { + + TokenStream tokenStream = analyzer.tokenStream(null, text); + CharTermAttribute termAttribute = + tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + while (tokenStream.incrementToken()) { + String token = termAttribute.toString(); + + if(!token.matches(".*\\d.*") && token.length() > 2 && !token.matches("^[^a-zA-Z0-9].*")) { + words.put(termAttribute.toString(), words.getOrDefault(termAttribute.toString(), 0)+1); + } + + + } + logger.info("Es wurden "+words.size()+" Worte ausgelesen"); + tokenStream.close(); + + } catch (IOException e) { + e.printStackTrace(); + return null; + } + return textmap; + } + + + public Map getTopNWords(int n) { + return words.entrySet() + .stream() + .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue())) + .limit(n) + .collect(TreeMap::new, + (m, e) -> m.put(e.getKey(), e.getValue()), + TreeMap::putAll);//2. 
Quelle + } + + +} + + + + diff --git a/de.hs-mannheim.informatik.wordcloud/src/main/java/de/hs_mannheim/informatik/wordcloud/main/InsertWordcloudElements.java b/de.hs-mannheim.informatik.wordcloud/src/main/java/de/hs_mannheim/informatik/wordcloud/main/InsertWordcloudElements.java new file mode 100644 index 0000000..bcb7ced --- /dev/null +++ b/de.hs-mannheim.informatik.wordcloud/src/main/java/de/hs_mannheim/informatik/wordcloud/main/InsertWordcloudElements.java @@ -0,0 +1,156 @@ +package de.hs_mannheim.informatik.wordcloud.main; + + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + + + public class InsertWordcloudElements { + private static final Logger logger = LogManager.getLogger(InsertWordcloudElements.class); + private String search; + + + InsertWordcloudElements(Map cloudwords, ArrayList nonoWords, int neededFreq, String search) { + this.search = search; + + if(!nonoWords.isEmpty()) { + Map filterMap = filter(cloudwords, nonoWords); + enterWordcloudElements(filterMap, neededFreq); + }else { + enterWordcloudElements(cloudwords, neededFreq); + } + + createCSVFile(); + writeCSVFile(cloudwords); + } + + + + public Map filter(Map words,ArrayList badWords) { + words.keySet().removeIf(badWords::contains); + return words; + } + + + + public void createCSVFile() { + try { + File file = new File("src/test/resources/woerter.csv"); + if(file.createNewFile()) { + logger.info("Die csv Datei wurde erstellt"); + } + else { + logger.info("csv datei exestiert bereits"); + } + }catch(Exception e) { + logger.error("Fehler beim erstellen der csv Datei.", e); + e.printStackTrace(); + } + } + + + public void writeCSVFile(Map words) { + try { + FileWriter write = new 
FileWriter("src/test/resources/woerter.csv"); + for(String word: words.keySet()) { + write.write(word+", "+words.get(word)+",\n"); + } + write.close(); + logger.info("csv datei wurde erfolgreich berschrieben."); + }catch(Exception e) { + logger.error("Datei konnte nicht beschrieben werden."); + e.printStackTrace(); + } + } + + + + public void enterWordcloudElements(Map words, int neededFreq) { + + int minFreq = Collections.min(words.values()); + int maxFreq = Collections.max(words.values()); + + String filepath = "site/wordcloud.html"; + + try { + List lines = Files.readAllLines(Paths.get(filepath)); + List updateLines = new ArrayList<>(); + + boolean inOldSpanBlock = false; + + for (String line : lines) { + + + if (inOldSpanBlock) { + if (line.contains("")) { + + updateLines.add(line); + inOldSpanBlock = false; + } + continue; + } + + + updateLines.add(line); + + if (line.contains("")) { + int idCounter = 0; + + for (String key : words.keySet()) { + if (words.get(key) < neededFreq) { + continue; + } + + String tagClass = getTagcloudClass(words.get(key), minFreq, maxFreq); + + String word = "" + + key + ""; + + updateLines.add(word); + idCounter++; + } + + + inOldSpanBlock = true; + } + } + + + Files.write(Paths.get(filepath), updateLines); + + } catch (IOException e) { + System.out.println("Fehler beim Lesen oder Schreiben der Datei."); + e.printStackTrace(); + } + } + + + public String getTagcloudClass(int frequency, int minFreq, int maxFreq) { + if (maxFreq == minFreq) { + return "tagcloud5"; + } + + + int range = maxFreq - minFreq; + int relativeValue = (int) Math.round(10.0 * (frequency - minFreq) / range); + + return "tagcloud" + relativeValue; + } + + + } + diff --git a/de.hs-mannheim.informatik.wordcloud/src/main/java/de/hs_mannheim/informatik/wordcloud/main/main.java b/de.hs-mannheim.informatik.wordcloud/src/main/java/de/hs_mannheim/informatik/wordcloud/main/main.java new file mode 100644 index 0000000..b6fe399 --- /dev/null +++ 
// NOTE(review): Java convention is an UpperCamelCase class name ("Main");
// renaming also requires renaming the file main.java.
public class main {

    /**
     * Application entry point: starts the interactive user interface.
     *
     * @param args unused command line arguments
     * @throws FileNotFoundException presumably propagated from UserInterface
     *         startup — confirm against the UserInterface constructor
     */
    public static void main(String[] args) throws FileNotFoundException{
        new UserInterface();

    }

}
if(pdfText != null) { + textAnalyzis(pdfText, language); + } + }else if(path.endsWith(".docx")) { + String text = reading(path); + if(text != null) { + textAnalyzis(text, language); + } + }else if(path.endsWith(".pptx")) { + String text = pptReading(path); + if(text != null) { + textAnalyzis(text, language); + } + } + + + + + } + + public enum Language { + ENGLISH { + @Override + public Analyzer getAnalyzer() { + return new EnglishAnalyzer(); + } + }, + GERMAN { + @Override + public Analyzer getAnalyzer() { + return new GermanAnalyzer(); + } + }; + + public abstract Analyzer getAnalyzer(); + } + + + public Map getWords() { + return words; + } + + + + + + public String reading(String path) { + + String filePath = path; + logger.info("Datei wird gelesen: " + filePath); + File file = new File(filePath); + + + StringBuilder sb = new StringBuilder(); + + try (FileInputStream fis = new FileInputStream(file); + XWPFDocument document = new XWPFDocument(fis)) { + + List paragraphs = document.getParagraphs(); + for (XWPFParagraph para : paragraphs) { + sb.append(para.getText()).append("\n"); + + } + + return sb.toString(); + + } catch (IOException e) { + logger.error("Fehler beim Öffnen der Word-Datei: " + file.getPath(), e); + return null; + } + + + } + + + public String pdfReading(String path) { + + String filePath = path; + logger.info("Datei wird gelesen: " + filePath); + File file = new File(filePath); + + String text = " "; + + try(PDDocument document = Loader.loadPDF(file)){ + PDFTextStripper pdfStripper = new PDFTextStripper(); + text = pdfStripper.getText(document); + return text; + }catch(Exception e) { + logger.error("Fehler beim öffnen der Datei.", e); + e.printStackTrace(); + return null; + } + } + + + public String pptReading(String path) { + StringBuilder text = new StringBuilder(); + logger.info("Datei wird gelesen: " + path); + try (FileInputStream fis = new FileInputStream(path); + XMLSlideShow ppt = new XMLSlideShow(fis)) { + + for (XSLFSlide slide : 
ppt.getSlides()) { + for (XSLFShape shape : slide.getShapes()) { + if (shape instanceof XSLFTextShape) { + XSLFTextShape textShape = (XSLFTextShape) shape; + text.append(textShape.getText()).append("\n"); + } + } + } + + } catch (IOException e) { + System.err.println("Fehler beim Lesen der PPTX-Datei:"); + e.printStackTrace(); + } + + return text.toString(); + } + + + + public Map textAnalyzis(String text, Language language) { + + Map textmap = new TreeMap<>(); + try (Analyzer analyzer = language.getAnalyzer()) { + + TokenStream tokenStream = analyzer.tokenStream(null, text); + CharTermAttribute termAttribute = + tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + while (tokenStream.incrementToken()) { + String token = termAttribute.toString(); + + if(!token.matches(".*\\d.*") && token.length() > 2 && !token.matches("^[^a-zA-Z0-9].*")) { + words.put(termAttribute.toString(), words.getOrDefault(termAttribute.toString(), 0)+1); + } + + + } + logger.info("Es wurden "+words.size()+" Worte ausgelesen"); + tokenStream.close(); + + } catch (IOException e) { + e.printStackTrace(); + return null; + } + return textmap; + } + + + public Map getTopNWords(int n) { + return words.entrySet() + .stream() + .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue())) + .limit(n) + .collect(TreeMap::new, + (m, e) -> m.put(e.getKey(), e.getValue()), + TreeMap::putAll); + } + + +} + + + + diff --git a/de.hs-mannheim.informatik.wordcloud/src/test/java/de/hs_mannheim/informatik/wordcloud/test/Filereadingtest.java b/de.hs-mannheim.informatik.wordcloud/src/test/java/de/hs_mannheim/informatik/wordcloud/test/Filereadingtest.java new file mode 100644 index 0000000..e76daa9 --- /dev/null +++ b/de.hs-mannheim.informatik.wordcloud/src/test/java/de/hs_mannheim/informatik/wordcloud/test/Filereadingtest.java @@ -0,0 +1,35 @@ +package de.hs_mannheim.informatik.wordcloud.test; + +import static org.junit.jupiter.api.Assertions.*; + +import java.util.Map; + +import 
org.junit.jupiter.api.Test; + +import de.hs_mannheim.informatik.wordcloud.test.Filereading.Language; + +class FileReadingtest { + + @Test + public void testAnalyzeText() { + + Language deLang = Language.GERMAN; + Language enLang = Language.ENGLISH; + + Filereading fileReading = new Filereading("src/test/resources/testCfile.pdf",deLang); + Filereading docReading = new Filereading("src/test/resources/test.docx", deLang); + Filereading pptxReading = new Filereading("src/test/resources/samplepptx.pptx", enLang); + + + Map words = fileReading.getWords(); + Map docwords = docReading.getWords(); + Map pptxwords = pptxReading.getWords(); + + assertEquals(4, words.get("welt")); + assertEquals(4, docwords.get("hallo")); + assertEquals(2, pptxwords.get("handout")); + + + } + +} diff --git a/de.hs-mannheim.informatik.wordcloud/src/test/java/de/hs_mannheim/informatik/wordcloud/test/InsertWordcloudElements.java b/de.hs-mannheim.informatik.wordcloud/src/test/java/de/hs_mannheim/informatik/wordcloud/test/InsertWordcloudElements.java new file mode 100644 index 0000000..7fb62c0 --- /dev/null +++ b/de.hs-mannheim.informatik.wordcloud/src/test/java/de/hs_mannheim/informatik/wordcloud/test/InsertWordcloudElements.java @@ -0,0 +1,156 @@ +package de.hs_mannheim.informatik.wordcloud.test; + + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + + + public class InsertWordcloudElements { + private static final Logger logger = LogManager.getLogger(InsertWordcloudElements.class); + private String search; + + + InsertWordcloudElements(Map cloudwords, ArrayList nonoWords, int neededFreq, String search) { + this.search = search; + + if(!nonoWords.isEmpty()) { + Map filterMap = filter(cloudwords, nonoWords); + 
enterWordcloudElements(filterMap, neededFreq); + }else { + enterWordcloudElements(cloudwords, neededFreq); + } + + createCSVFile(); + writeCSVFile(cloudwords); + } + + + + public Map filter(Map words,ArrayList badWords) { + words.keySet().removeIf(badWords::contains); + return words; + } + + + + public void createCSVFile() { + try { + File file = new File("src/test/resources/woerter.csv"); + if(file.createNewFile()) { + logger.info("Die csv Datei wurde erstellt"); + } + else { + logger.info("csv datei exestiert bereits"); + } + }catch(Exception e) { + logger.error("Fehler beim erstellen der csv Datei.", e); + e.printStackTrace(); + } + } + + + public void writeCSVFile(Map words) { + try { + FileWriter write = new FileWriter("src/test/resources/woerter.csv"); + for(String word: words.keySet()) { + write.write(word+", "+words.get(word)+",\n"); + } + write.close(); + logger.info("csv datei wurde erfolgreich berschrieben."); + }catch(Exception e) { + logger.error("Datei konnte nicht beschrieben werden."); + e.printStackTrace(); + } + } + + + + public void enterWordcloudElements(Map words, int neededFreq) { + + int minFreq = Collections.min(words.values()); + int maxFreq = Collections.max(words.values()); + + String filepath = "site/wordcloud.html"; + + try { + List lines = Files.readAllLines(Paths.get(filepath)); + List updateLines = new ArrayList<>(); + + boolean inOldSpanBlock = false; + + for (String line : lines) { + + + if (inOldSpanBlock) { + if (line.contains("")) { + + updateLines.add(line); + inOldSpanBlock = false; + } + continue; + } + + + updateLines.add(line); + + if (line.contains("")) { + int idCounter = 0; + + for (String key : words.keySet()) { + if (words.get(key) < neededFreq) { + continue; + } + + String tagClass = getTagcloudClass(words.get(key), minFreq, maxFreq); + + String word = "" + + key + ""; + + updateLines.add(word); + idCounter++; + } + + + inOldSpanBlock = true; + } + } + + + Files.write(Paths.get(filepath), updateLines); + + } catch 
// NOTE(review): duplicate of the production entry point in the main package;
// Java convention is an UpperCamelCase class name ("Test"), which also
// requires renaming the file test.java.
public class test {

    /**
     * Entry point of the test-package copy: starts the interactive user
     * interface.
     *
     * @param args unused command line arguments
     * @throws FileNotFoundException presumably propagated from UserInterface
     *         startup — confirm against the UserInterface constructor
     */
    public static void main(String[] args) throws FileNotFoundException{
        new UserInterface();

    }

}