some refactoring and implement stemming
parent
8a74b2a7b5
commit
19350fc80c
|
@ -12,7 +12,6 @@ import org.apache.poi.xslf.usermodel.XSLFTextShape;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.de.GermanStemmer;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
|
||||||
|
@ -26,8 +25,8 @@ public class TextProcessing {
|
||||||
private Set<String> stopwordList = new HashSet<>();
|
private Set<String> stopwordList = new HashSet<>();
|
||||||
|
|
||||||
public String formatToText(File file, String format) {
|
public String formatToText(File file, String format) {
|
||||||
|
StringBuilder text = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
StringBuilder text = new StringBuilder();
|
|
||||||
if (file != null) {
|
if (file != null) {
|
||||||
switch (format) {
|
switch (format) {
|
||||||
case "txt":
|
case "txt":
|
||||||
|
@ -65,7 +64,7 @@ public class TextProcessing {
|
||||||
catch (IOException e) {
|
catch (IOException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
return "Nothing found!";
|
return text.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void textToSetStopwords(Map<String, Integer> words) {
|
public void textToSetStopwords(Map<String, Integer> words) {
|
||||||
|
@ -113,7 +112,7 @@ public class TextProcessing {
|
||||||
CharArraySet luceneStopwords = stopwordList != null ? new CharArraySet(stopwordList,
|
CharArraySet luceneStopwords = stopwordList != null ? new CharArraySet(stopwordList,
|
||||||
true) : CharArraySet.EMPTY_SET;
|
true) : CharArraySet.EMPTY_SET;
|
||||||
|
|
||||||
try (Analyzer analyzer = new StandardAnalyzer(luceneStopwords)) {
|
try (Analyzer analyzer = stemming ? new GermanAnalyzer(luceneStopwords) : new StandardAnalyzer(luceneStopwords)) {
|
||||||
TokenStream tokenStream = analyzer.tokenStream(null, text);
|
TokenStream tokenStream = analyzer.tokenStream(null, text);
|
||||||
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
|
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
|
|
|
@ -10,10 +10,6 @@ public class WordCloudCreator {
|
||||||
File templateFile = new File("wordcloud.html"); // Template in project directory
|
File templateFile = new File("wordcloud.html"); // Template in project directory
|
||||||
File outputFile = new File("createdHTML.html"); // Output in project directory
|
File outputFile = new File("createdHTML.html"); // Output in project directory
|
||||||
|
|
||||||
if (!templateFile.exists()) {
|
|
||||||
throw new RuntimeException("File not found!");
|
|
||||||
}
|
|
||||||
|
|
||||||
try (BufferedReader reader = new BufferedReader(new FileReader(templateFile));
|
try (BufferedReader reader = new BufferedReader(new FileReader(templateFile));
|
||||||
BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
|
BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,8 @@ public class TUI {
|
||||||
scan.close();
|
scan.close();
|
||||||
System.out.println("Close Program!");
|
System.out.println("Close Program!");
|
||||||
break;
|
break;
|
||||||
|
default:
|
||||||
|
System.out.println("Unknown input!");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
scan.close();
|
scan.close();
|
||||||
|
@ -93,11 +95,14 @@ public class TUI {
|
||||||
break;
|
break;
|
||||||
case(3):
|
case(3):
|
||||||
// Set Stemming
|
// Set Stemming
|
||||||
System.out.println("Stemming: Input 'yes' or 'no'? ");
|
System.out.println("Activate stemming? Type 'yes' or 'no'?\nOnly German stemming!");
|
||||||
String stemmingOption = scan.nextLine();
|
String stemmingOption = scan.nextLine();
|
||||||
if(stemmingOption.equals("yes")) {
|
if(stemmingOption.equals("yes")) {
|
||||||
wcm.setStemming(true);
|
wcm.setStemming(true);
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
System.out.println("Unknown Input!");
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case(4):
|
case(4):
|
||||||
//Create WordCloud and exit program
|
//Create WordCloud and exit program
|
||||||
|
@ -110,6 +115,8 @@ public class TUI {
|
||||||
System.out.println("HTML FIle not created!\n");
|
System.out.println("HTML FIle not created!\n");
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
default:
|
||||||
|
System.out.println("Unknown input!");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue