169 lines
6.1 KiB
Java
169 lines
6.1 KiB
Java
package domain;
|
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.text.PDFTextStripper;
|
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
|
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
|
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
|
import org.apache.poi.xslf.usermodel.XSLFShape;
|
|
import org.apache.poi.xslf.usermodel.XSLFTextShape;
|
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
|
import org.apache.lucene.analysis.Analyzer;
|
|
import org.apache.lucene.analysis.TokenStream;
|
|
import org.apache.lucene.analysis.de.GermanStemmer;
|
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
import org.apache.lucene.analysis.CharArraySet;
|
|
|
|
import java.io.IOException;
|
|
import java.util.*;
|
|
import java.io.*;
|
|
|
|
public class TextProcessing {
|
|
// Flag for optional word stemming (not yet wired up; see the commented-out accessors below).
private boolean stemming;

// Upper bound on the number of entries maxShowWords() will keep.
private int maxWords;

// Kept for later use with extensive adjustments.
// public boolean isStemming() {
// return stemming;
// }
//
// public int getMaxWords() {
// return maxWords;
// }
//
// public void setStemming(boolean stemming) {
// this.stemming = stemming;
// }
//
// Sets the maximum number of entries maxShowWords() returns.
public void setMaxWords(int maxWords) {
this.maxWords = maxWords;
}
|
|
|
|
public String formatToText(File file, String format) {
|
|
try {
|
|
StringBuilder text = new StringBuilder();
|
|
if (file != null) {
|
|
switch (format) {
|
|
case "txt":
|
|
FileReader fileReader = new FileReader(file);
|
|
BufferedReader reader = new BufferedReader(fileReader);
|
|
String line;
|
|
while((line = reader.readLine()) != null) {
|
|
text.append(line).append("\n");
|
|
}
|
|
return text.toString();
|
|
case "pdf":
|
|
PDDocument document = PDDocument.load(file);
|
|
PDFTextStripper pdfStripper = new PDFTextStripper();
|
|
return pdfStripper.getText(document);
|
|
|
|
case "docx":
|
|
XWPFDocument officeDocument = new XWPFDocument(new FileInputStream(file));
|
|
for(XWPFParagraph paragraph : officeDocument.getParagraphs()) {
|
|
text.append(paragraph.getText()).append("\n");
|
|
}
|
|
return text.toString();
|
|
case "pptx":
|
|
XMLSlideShow ppt = new XMLSlideShow(new FileInputStream(file));
|
|
for (XSLFSlide slide : ppt.getSlides()) {
|
|
for (XSLFShape shape : slide.getShapes()) {
|
|
if (shape instanceof XSLFTextShape) {
|
|
text.append(((XSLFTextShape) shape).getText()).append("\n");
|
|
}
|
|
}
|
|
}
|
|
return text.toString();
|
|
}
|
|
}
|
|
}
|
|
catch (IOException e) {
|
|
throw new RuntimeException(e);
|
|
}
|
|
return "Nothing found!";
|
|
}
|
|
|
|
public Map<String, Integer> maxShowWords(Map<String, Integer> words) {
|
|
HashMap <String, Integer> cuttedHashmap = new HashMap<>();
|
|
int index = maxWords;
|
|
for (String word : words.keySet()) {
|
|
if(index > 0) {
|
|
cuttedHashmap.put(word, words.get(word));
|
|
}
|
|
index--;
|
|
}
|
|
return cuttedHashmap;
|
|
}
|
|
|
|
//KI Methode die abgeändert wurde, damit sie in dieses Programm passt
|
|
public Map<String, Integer> tokenizingFile(String text, Set<String> stopwords) {
|
|
Map<String, Integer> words = new HashMap<>();
|
|
|
|
if (text == null || text.isBlank()) {
|
|
return words;
|
|
}
|
|
CharArraySet luceneStopwords =
|
|
stopwords != null ? new CharArraySet(stopwords, true) : CharArraySet.EMPTY_SET;
|
|
|
|
try (Analyzer analyzer = new StandardAnalyzer(luceneStopwords)) {
|
|
TokenStream tokenStream = analyzer.tokenStream(null, text);
|
|
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
|
|
|
|
tokenStream.reset();
|
|
while (tokenStream.incrementToken()) {
|
|
String word = charTermAttribute.toString();
|
|
if (words.containsKey(word)) {
|
|
words.compute(word, (k, counter) -> counter + 1);
|
|
}
|
|
else {
|
|
words.put(word, 1);
|
|
}
|
|
}
|
|
tokenStream.end();
|
|
}
|
|
catch (IOException e) {
|
|
throw new RuntimeException(e);
|
|
}
|
|
return words;
|
|
}
|
|
|
|
public Set<String> textToSetStopwords(Map<String, Integer> words) {
|
|
Set<String> stopwordList = new HashSet<>();
|
|
for (Map.Entry<String, Integer> entry : words.entrySet()) {
|
|
stopwordList.add(entry.getKey());
|
|
}
|
|
return stopwordList;
|
|
}
|
|
|
|
public String fileToTextString(File path, String format) {
|
|
String text = formatToText(path, format);
|
|
return text;
|
|
}
|
|
|
|
public Map<String, Integer> sortList(Map<String, Integer> unsortedMap) {
|
|
List<Map.Entry<String, Integer>> entryList = new ArrayList<>(unsortedMap.entrySet());
|
|
|
|
entryList.sort((e1, e2) -> e2.getValue().compareTo(e1.getValue())); //Ki erstellte Zeile
|
|
|
|
Map<String, Integer> sortedMap = new LinkedHashMap<>();
|
|
for (Map.Entry<String, Integer> entry : entryList) {
|
|
sortedMap.put(entry.getKey(), entry.getValue());
|
|
}
|
|
|
|
return sortedMap;
|
|
}
|
|
|
|
// public Map<String, Integer> stemming(Map<String, Integer> wordList) {
|
|
// Map<String, Integer> wordCounts = new HashMap<>();
|
|
// GermanStemmer stemmer = new GermanStemmer();
|
|
//
|
|
// for (String key: wordList.keySet()) {
|
|
// String stemmedWord = stemmer.stemWord(key);
|
|
// if (stemmedWord != null) {
|
|
// wordCounts.merge(stemmedWord, 1, Integer::sum);
|
|
// }
|
|
// }
|
|
// return wordCounts;
|
|
// }
|
|
}
|