Implement switch-case text extraction for pptx, docx, and txt files, and add comments to the loadFileGUI method

pull/5/head
Daniel Fromm 2025-05-08 22:09:30 +02:00
parent af33d8a566
commit d9ae97aea4
3 changed files with 62 additions and 21 deletions

20
pom.xml
View File

@ -14,6 +14,26 @@
<artifactId>pdfbox</artifactId> <artifactId>pdfbox</artifactId>
<version>2.0.29</version> <version>2.0.29</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.xmlbeans</groupId>
<artifactId>xmlbeans</artifactId>
<version>5.1.1</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.18.0</version>
</dependency>
</dependencies> </dependencies>
<properties> <properties>

View File

@ -14,14 +14,14 @@ public class FileLoader {
public FileLoader() { public FileLoader() {
this.inputFile = null; this.inputFile = null;
} }
//KI erstellte Methode
public File loadFileGUI() { public File loadFileGUI() {
try { try {
JFileChooser fileChooser = new JFileChooser(); JFileChooser fileChooser = new JFileChooser();
fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("PDF Files", "pdf")); fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("PDF Files", "pdf"));
fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("Text Files", "txt")); fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("Text Files", "txt")); //selbst hinzugefügt
fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("Word Documents", "docx")); fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("Word Documents", "docx")); //selbst hinzugefügt
fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("PowerPoint Presentations", "pptx")); fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("PowerPoint Presentations", "pptx")); //selbst hinzugefügt
int result = fileChooser.showOpenDialog(null); int result = fileChooser.showOpenDialog(null);
if (result == JFileChooser.APPROVE_OPTION) { if (result == JFileChooser.APPROVE_OPTION) {

View File

@ -2,6 +2,12 @@ package domain;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import java.io.*; import java.io.*;
import java.util.HashMap; import java.util.HashMap;
@ -9,31 +15,44 @@ import java.util.HashMap;
public class TextProcessing { public class TextProcessing {
public String formatToText(File file, String format) { public String formatToText(File file, String format) {
PDDocument document;
try { try {
StringBuilder text = new StringBuilder();
if (file != null) { if (file != null) {
switch (format) { switch (format) {
case "txt": case "txt":
FileReader fileReader = new FileReader(file);
break; BufferedReader reader = new BufferedReader(fileReader);
String line;
while((line = reader.readLine()) != null) {
text.append(line).append("\n");
}
return text.toString();
case "pdf": case "pdf":
document = PDDocument.load(file); PDDocument document = PDDocument.load(file);
PDFTextStripper pdfStripper = new PDFTextStripper(); PDFTextStripper pdfStripper = new PDFTextStripper();
return pdfStripper.getText(document); return pdfStripper.getText(document);
case "docx": case "docx":
XWPFDocument officeDocument = new XWPFDocument(new FileInputStream(file));
break; for(XWPFParagraph paragraph : officeDocument.getParagraphs()) {
text.append(paragraph.getText()).append("\n");
}
return text.toString();
case "pptx": case "pptx":
XMLSlideShow ppt = new XMLSlideShow(new FileInputStream(file));
break; for (XSLFSlide slide : ppt.getSlides()) {
for (XSLFShape shape : slide.getShapes()) {
if (shape instanceof XSLFTextShape) {
text.append(((XSLFTextShape) shape).getText()).append("\n");
}
}
}
return text.toString();
} }
} }
} }
catch (IOException ex) { catch (IOException e) {
throw new RuntimeException(ex); throw new RuntimeException(e);
} }
return "Nothing found!"; return "Nothing found!";
} }
@ -53,18 +72,20 @@ public class TextProcessing {
public HashMap tokenizingText(String text){ public HashMap tokenizingText(String text){
HashMap<String, Integer> filteredWords = new HashMap<>(); HashMap<String, Integer> filteredWords = new HashMap<>();
try { try {
if(!text.isEmpty()) {
//Tokenizing der Wörter //Tokenizing der Wörter
String splitter = "[,\\s\\.:/!§$%&/()=?+*~#.;_<>^°\"']"; String splitter = "[,\\s\\.:/!§$%&/()=?+*~#.;_<\\->^°\"']";
String[] textWords = text.split(splitter); String[] textWords = text.split(splitter);
for(String word : textWords){ for (String word : textWords) {
if (filteredWords.containsKey(word)) { if (filteredWords.containsKey(word)) {
filteredWords.compute(word, (k, counter) -> counter + 1); filteredWords.compute(word, (k, counter) -> counter + 1);
} } else {
else {
filteredWords.put(word, 1); filteredWords.put(word, 1);
} }
} }
} catch (Exception ex) { }
}
catch (Exception ex) {
throw new RuntimeException(ex); throw new RuntimeException(ex);
} }
return filteredWords; return filteredWords;