From ffda6973d21751692f40379fc2b19837ef2a178f Mon Sep 17 00:00:00 2001
From: Daniel Fromm <3015351@stud.hs-mannheim.de>
Date: Sun, 11 May 2025 20:57:06 +0200
Subject: [PATCH] added few methods for processing

---
 src/main/java/domain/TextProcessing.java | 112 +++++++++++++++++++----
 1 file changed, 93 insertions(+), 19 deletions(-)

diff --git a/src/main/java/domain/TextProcessing.java b/src/main/java/domain/TextProcessing.java
index c6b2e20..133b145 100644
--- a/src/main/java/domain/TextProcessing.java
+++ b/src/main/java/domain/TextProcessing.java
@@ -8,11 +8,36 @@ import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFSlide;
 import org.apache.poi.xslf.usermodel.XSLFShape;
 import org.apache.poi.xslf.usermodel.XSLFTextShape;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.de.GermanStemmer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.CharArraySet;
+import java.io.IOException;
+import java.util.*;
 import java.io.*;
-import java.util.HashMap;
 
 public class TextProcessing {
 
+    private boolean stemming;
+    private int maxWords;
+
+    public boolean isStemming() {
+        return stemming;
+    }
+
+    public int getMaxWords() {
+        return maxWords;
+    }
+
+    public void setStemming(boolean stemming) {
+        this.stemming = stemming;
+    }
+
+    public void setMaxWords(int maxWords) {
+        this.maxWords = maxWords;
+    }
 
     public String formatToText(File file, String format) {
         try {
@@ -57,9 +82,9 @@ public class TextProcessing {
         return "Nothing found!";
     }
 
-    public HashMap<String, Integer> maxShowWords(int number, HashMap<String, Integer> words) {
+    public Map<String, Integer> maxShowWords(Map<String, Integer> words) {
         HashMap<String, Integer> cuttedHashmap = new HashMap<>();
-        int index = number;
+        int index = maxWords;
         for (String word : words.keySet()) {
             if(index > 0) {
                 cuttedHashmap.put(word, words.get(word));
@@ -69,25 +94,74 @@ public class TextProcessing {
         return cuttedHashmap;
     }
 
-    public HashMap<String, Integer> tokenizingText(String text){
-        HashMap<String, Integer> filteredWords = new HashMap<>();
-        try {
-            if(!text.isEmpty()) {
-                //Tokenizing der Wörter
-                String splitter = "[,\\s\\.:/!§$%&/()=?+*~#.;_<\\-–>^°\"']";
-                String[] textWords = text.split(splitter);
-                for (String word : textWords) {
-                    if (filteredWords.containsKey(word)) {
-                        filteredWords.compute(word, (k, counter) -> counter + 1);
-                    } else {
-                        filteredWords.put(word, 1);
+    //KI Methode die abgeändert wurde, damit sie in dieses Programm passt
+    public Map<String, Integer> tokenizingFile(String text, Set<String> stopwords) {
+        Map<String, Integer> words = new HashMap<>();
+
+        if (text == null || text.isBlank()) {
+            return words;
+        }
+        CharArraySet luceneStopwords =
+                stopwords != null ? new CharArraySet(stopwords, true) : CharArraySet.EMPTY_SET;
+
+        try (Analyzer analyzer = new StandardAnalyzer(luceneStopwords)) {
+            TokenStream tokenStream = analyzer.tokenStream(null, text);
+            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
+
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                String word = charTermAttribute.toString();
+                if (words.containsKey(word)) {
+                    words.compute(word, (k, counter) -> counter + 1);
+                }
+                else {
+                    words.put(word, 1);
                 }
-            }
             }
+            tokenStream.end();
         }
-        catch (Exception ex) {
-            throw new RuntimeException(ex);
+        catch (IOException e) {
+            throw new RuntimeException(e);
         }
-        return filteredWords;
+        return words;
     }
+
+    public Set<String> textToSetStopwords(Map<String, Integer> words) {
+        Set<String> stopwordList = new HashSet<>();
+        for (Map.Entry<String, Integer> entry : words.entrySet()) {
+            stopwordList.add(entry.getKey());
+        }
+        return stopwordList;
+    }
+
+    public String fileToTextString(File path, String format) {
+        String text = formatToText(path, format);
+        return text;
+    }
+
+    public Map<String, Integer> sortList(Map<String, Integer> unsortedMap) {
+        List<Map.Entry<String, Integer>> entryList = new ArrayList<>(unsortedMap.entrySet());
+
+        entryList.sort((e1, e2) -> e2.getValue().compareTo(e1.getValue())); //Ki erstellte Zeile
+
+        Map<String, Integer> sortedMap = new LinkedHashMap<>();
+        for (Map.Entry<String, Integer> entry : entryList) {
+            sortedMap.put(entry.getKey(), entry.getValue());
+        }
+
+        return sortedMap;
+    }
+
+//    public Map<String, Integer> stemming(Map<String, Integer> wordList) {
+//        Map<String, Integer> wordCounts = new HashMap<>();
+//        GermanStemmer stemmer = new GermanStemmer();
+//
+//        for (String key: wordList.keySet()) {
+//            String stemmedWord = stemmer.stemWord(key);
+//            if (stemmedWord != null) {
+//                wordCounts.merge(stemmedWord, 1, Integer::sum);
+//            }
+//        }
+//        return wordCounts;
+//    }
 }