package domain;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanStemmer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.CharArraySet;

import java.io.IOException;
import java.util.*;
import java.io.*;

public class TextProcessing {

    private boolean stemming;
    private int maxWords;

    // kept for later use with more extensive customization
    // public boolean isStemming() {
    //     return stemming;
    // }
    //
    // public int getMaxWords() {
    //     return maxWords;
    // }
    //
    // public void setStemming(boolean stemming) {
    //     this.stemming = stemming;
    // }

    public void setMaxWords(int maxWords) {
        this.maxWords = maxWords;
    }

    // Extracts the plain text of a txt, pdf, docx, or pptx file.
    public String formatToText(File file, String format) {
        try {
            StringBuilder text = new StringBuilder();
            if (file != null) {
                switch (format) {
                    case "txt":
                        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
                            String line;
                            while ((line = reader.readLine()) != null) {
                                text.append(line).append("\n");
                            }
                        }
                        return text.toString();
                    case "pdf":
                        try (PDDocument document = PDDocument.load(file)) {
                            PDFTextStripper pdfStripper = new PDFTextStripper();
                            return pdfStripper.getText(document);
                        }
                    case "docx":
                        try (XWPFDocument officeDocument = new XWPFDocument(new FileInputStream(file))) {
                            for (XWPFParagraph paragraph : officeDocument.getParagraphs()) {
                                text.append(paragraph.getText()).append("\n");
                            }
                        }
                        return text.toString();
                    case "pptx":
                        try (XMLSlideShow ppt = new XMLSlideShow(new FileInputStream(file))) {
                            for (XSLFSlide slide : ppt.getSlides()) {
                                for (XSLFShape shape : slide.getShapes()) {
                                    if (shape instanceof XSLFTextShape) {
                                        text.append(((XSLFTextShape) shape).getText()).append("\n");
                                    }
                                }
                            }
                        }
                        return text.toString();
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return "Nothing found!";
    }

    // Keeps only the first maxWords entries of the given (already sorted) word-count map.
    public Map<String, Integer> maxShowWords(Map<String, Integer> words) {
        // LinkedHashMap so the frequency order produced by sortList() is preserved
        Map<String, Integer> cuttedHashmap = new LinkedHashMap<>();
        int index = maxWords;
        for (String word : words.keySet()) {
            if (index <= 0) {
                break;
            }
            cuttedHashmap.put(word, words.get(word));
            index--;
        }
        return cuttedHashmap;
    }

    // AI-generated method, adapted so that it fits into this program
    public Map<String, Integer> tokenizingFile(String text, Set<String> stopwords) {
        Map<String, Integer> words = new HashMap<>();
        if (text == null || text.isBlank()) {
            return words;
        }
        CharArraySet luceneStopwords = stopwords != null ?
                new CharArraySet(stopwords, true) : CharArraySet.EMPTY_SET;
        try (Analyzer analyzer = new StandardAnalyzer(luceneStopwords)) {
            TokenStream tokenStream = analyzer.tokenStream(null, text);
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String word = charTermAttribute.toString();
                // count each token, incrementing the count for words seen before
                words.merge(word, 1, Integer::sum);
            }
            tokenStream.end();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return words;
    }

    // Collects the keys of a word-count map into a stopword set.
    public Set<String> textToSetStopwords(Map<String, Integer> words) {
        Set<String> stopwordList = new HashSet<>();
        for (Map.Entry<String, Integer> entry : words.entrySet()) {
            stopwordList.add(entry.getKey());
        }
        return stopwordList;
    }

    public String fileToTextString(File path, String format) {
        return formatToText(path, format);
    }

    // Sorts a word-count map by descending frequency.
    public Map<String, Integer> sortList(Map<String, Integer> unsortedMap) {
        List<Map.Entry<String, Integer>> entryList = new ArrayList<>(unsortedMap.entrySet());
        entryList.sort((e1, e2) -> e2.getValue().compareTo(e1.getValue())); // AI-generated line
        Map<String, Integer> sortedMap = new LinkedHashMap<>();
        for (Map.Entry<String, Integer> entry : entryList) {
            sortedMap.put(entry.getKey(), entry.getValue());
        }
        return sortedMap;
    }

    // public Map<String, Integer> stemming(Map<String, Integer> wordList) {
    //     Map<String, Integer> wordCounts = new HashMap<>();
    //     GermanStemmer stemmer = new GermanStemmer();
    //
    //     for (String key : wordList.keySet()) {
    //         String stemmedWord = stemmer.stemWord(key);
    //         if (stemmedWord != null) {
    //             wordCounts.merge(stemmedWord, 1, Integer::sum);
    //         }
    //     }
    //     return wordCounts;
    // }
}
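
// Hedged usage sketch (not part of the original class): the demo class below illustrates how the
// methods above are presumably meant to be chained: extract text, build a stopword set, tokenize,
// sort by frequency, and cap the result with maxShowWords(). The class name, the file names, and
// the limit of 10 words are assumptions made up for illustration; the demo relies only on the
// java.io.* and java.util.* imports already present at the top of this file.
class TextProcessingUsageExample {

    public static void main(String[] args) {
        TextProcessing processing = new TextProcessing();
        processing.setMaxWords(10); // assumed limit: keep only the ten most frequent words

        // Hypothetical input files; replace with real paths.
        File stopwordFile = new File("stopwords_de.txt");
        File inputFile = new File("lecture_slides.pptx");

        // Build the stopword set from a plain-text stopword file (no stopwords applied at this step).
        Map<String, Integer> stopwordTokens =
                processing.tokenizingFile(processing.fileToTextString(stopwordFile, "txt"), null);
        Set<String> stopwords = processing.textToSetStopwords(stopwordTokens);

        // Extract, tokenize against the stopword set, sort by frequency, and truncate to maxWords entries.
        String text = processing.fileToTextString(inputFile, "pptx");
        Map<String, Integer> counts = processing.tokenizingFile(text, stopwords);
        Map<String, Integer> topWords = processing.maxShowWords(processing.sortList(counts));

        topWords.forEach((word, count) -> System.out.println(word + ": " + count));
    }
}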