// PR2WordCloud/src/main/java/domain/TextProcessing.java
package domain;
import java.io.*;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanStemmer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
/**
 * Turns text-bearing files (txt, pdf, docx, pptx) into word-frequency maps
 * suitable for rendering a word cloud: extraction, tokenization with stopword
 * filtering, frequency sorting, and truncation to a configured maximum size.
 */
public class TextProcessing {

    private boolean stemming;
    private int maxWords;

    // Reserved for later use with more extensive configuration options.
    // public boolean isStemming() {
    //     return stemming;
    // }
    //
    // public int getMaxWords() {
    //     return maxWords;
    // }
    //
    // public void setStemming(boolean stemming) {
    //     this.stemming = stemming;
    // }

    /**
     * Sets the maximum number of entries {@link #maxShowWords(Map)} will keep.
     *
     * @param maxWords upper bound on the size of the truncated word map
     */
    public void setMaxWords(int maxWords) {
        this.maxWords = maxWords;
    }

    /**
     * Extracts the plain-text content of the given file.
     *
     * <p>All underlying readers/documents are opened in try-with-resources so
     * they are closed even when reading fails (they leaked before this fix).
     *
     * @param file   the input file; may be {@code null}
     * @param format one of {@code "txt"}, {@code "pdf"}, {@code "docx"}, {@code "pptx"}
     * @return the extracted text, or the literal {@code "Nothing found!"} when the
     *         file is {@code null} or the format is not recognized
     * @throws RuntimeException wrapping any {@link IOException} raised while reading
     */
    public String formatToText(File file, String format) {
        try {
            StringBuilder text = new StringBuilder();
            if (file != null) {
                switch (format) {
                    case "txt":
                        // Pin the charset to UTF-8 instead of the platform default.
                        try (BufferedReader reader = new BufferedReader(
                                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
                            String line;
                            while ((line = reader.readLine()) != null) {
                                text.append(line).append("\n");
                            }
                        }
                        return text.toString();
                    case "pdf":
                        try (PDDocument document = PDDocument.load(file)) {
                            return new PDFTextStripper().getText(document);
                        }
                    case "docx":
                        try (XWPFDocument officeDocument = new XWPFDocument(new FileInputStream(file))) {
                            for (XWPFParagraph paragraph : officeDocument.getParagraphs()) {
                                text.append(paragraph.getText()).append("\n");
                            }
                        }
                        return text.toString();
                    case "pptx":
                        try (XMLSlideShow ppt = new XMLSlideShow(new FileInputStream(file))) {
                            for (XSLFSlide slide : ppt.getSlides()) {
                                for (XSLFShape shape : slide.getShapes()) {
                                    // Only text-bearing shapes contribute to the output.
                                    if (shape instanceof XSLFTextShape) {
                                        text.append(((XSLFTextShape) shape).getText()).append("\n");
                                    }
                                }
                            }
                        }
                        return text.toString();
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return "Nothing found!";
    }

    /**
     * Returns at most {@code maxWords} entries of the given word map.
     *
     * <p>Fix: the result is a {@link LinkedHashMap} so the iteration order of the
     * input (e.g. the frequency order produced by {@link #sortList(Map)}) is
     * preserved — the previous {@code HashMap} destroyed it — and the loop stops
     * as soon as the quota is filled instead of walking the whole map.
     *
     * @param words source map, typically already sorted by frequency
     * @return the first {@code maxWords} entries in the input's iteration order
     */
    public Map<String, Integer> maxShowWords(Map<String, Integer> words) {
        Map<String, Integer> limited = new LinkedHashMap<>();
        int remaining = maxWords;
        for (Map.Entry<String, Integer> entry : words.entrySet()) {
            if (remaining <= 0) {
                break;
            }
            limited.put(entry.getKey(), entry.getValue());
            remaining--;
        }
        return limited;
    }

    // AI-generated method, adapted to fit this program.
    /**
     * Tokenizes the given text with Lucene's {@link StandardAnalyzer} and counts
     * how often each token occurs, skipping the supplied stopwords.
     *
     * @param text      the raw text; {@code null} or blank yields an empty map
     * @param stopwords words to ignore (case-insensitive); may be {@code null}
     * @return token → occurrence count
     * @throws RuntimeException wrapping any {@link IOException} from the token stream
     */
    public Map<String, Integer> tokenizingFile(String text, Set<String> stopwords) {
        Map<String, Integer> words = new HashMap<>();
        if (text == null || text.isBlank()) {
            return words;
        }
        CharArraySet luceneStopwords =
                stopwords != null ? new CharArraySet(stopwords, true) : CharArraySet.EMPTY_SET;
        // The TokenStream is Closeable too; close it together with the analyzer.
        try (Analyzer analyzer = new StandardAnalyzer(luceneStopwords);
             TokenStream tokenStream = analyzer.tokenStream(null, text)) {
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // merge() replaces the containsKey/compute/put dance in one call.
                words.merge(charTermAttribute.toString(), 1, Integer::sum);
            }
            tokenStream.end();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return words;
    }

    /**
     * Collects the keys of a word-frequency map into a stopword set.
     *
     * @param words word → count map
     * @return a mutable set containing every key of {@code words}
     */
    public Set<String> textToSetStopwords(Map<String, Integer> words) {
        return new HashSet<>(words.keySet());
    }

    /**
     * Convenience wrapper around {@link #formatToText(File, String)}.
     *
     * @param path   the input file
     * @param format file format identifier ("txt", "pdf", "docx", "pptx")
     * @return the extracted text
     */
    public String fileToTextString(File path, String format) {
        return formatToText(path, format);
    }

    /**
     * Sorts a word-frequency map by descending count.
     *
     * @param unsortedMap word → count map in arbitrary order
     * @return a {@link LinkedHashMap} whose iteration order is highest count first
     */
    public Map<String, Integer> sortList(Map<String, Integer> unsortedMap) {
        List<Map.Entry<String, Integer>> entryList = new ArrayList<>(unsortedMap.entrySet());
        entryList.sort(Map.Entry.comparingByValue(Comparator.reverseOrder()));
        Map<String, Integer> sortedMap = new LinkedHashMap<>();
        for (Map.Entry<String, Integer> entry : entryList) {
            sortedMap.put(entry.getKey(), entry.getValue());
        }
        return sortedMap;
    }

    // Kept for later use.
    // NOTE(review): as written this counts each source word once per stem instead
    // of summing the original frequencies (merge uses 1, not wordList.get(key)) —
    // revisit before enabling.
    // public Map<String, Integer> stemming(Map<String, Integer> wordList) {
    //     Map<String, Integer> wordCounts = new HashMap<>();
    //     GermanStemmer stemmer = new GermanStemmer();
    //
    //     for (String key : wordList.keySet()) {
    //         String stemmedWord = stemmer.stemWord(key);
    //         if (stemmedWord != null) {
    //             wordCounts.merge(stemmedWord, 1, Integer::sum);
    //         }
    //     }
    //     return wordCounts;
    // }
}