Some refactoring and a fix for the PDDocument file-opening warning (the document is now closed via try-with-resources)
parent
19350fc80c
commit
59857d1173
|
@ -41,6 +41,7 @@ public class FileLoader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Detect the file format from the file name's extension for further processing
|
||||||
public String getFileFormat(File path) {
|
public String getFileFormat(File path) {
|
||||||
String fileName = path.getName();
|
String fileName = path.getName();
|
||||||
String fileFormat = fileName.contains(".") ? fileName.substring(fileName.lastIndexOf(".") + 1) : "";
|
String fileFormat = fileName.contains(".") ? fileName.substring(fileName.lastIndexOf(".") + 1) : "";
|
||||||
|
|
|
@ -24,6 +24,7 @@ public class TextProcessing {
|
||||||
private int maxWords = 0;
|
private int maxWords = 0;
|
||||||
private Set<String> stopwordList = new HashSet<>();
|
private Set<String> stopwordList = new HashSet<>();
|
||||||
|
|
||||||
|
//Extract text from file with supported format
|
||||||
public String formatToText(File file, String format) {
|
public String formatToText(File file, String format) {
|
||||||
StringBuilder text = new StringBuilder();
|
StringBuilder text = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
@ -38,9 +39,10 @@ public class TextProcessing {
|
||||||
}
|
}
|
||||||
return text.toString();
|
return text.toString();
|
||||||
case "pdf":
|
case "pdf":
|
||||||
PDDocument document = PDDocument.load(file);
|
try (PDDocument document = PDDocument.load(file)) {
|
||||||
PDFTextStripper pdfStripper = new PDFTextStripper();
|
PDFTextStripper pdfStripper = new PDFTextStripper();
|
||||||
return pdfStripper.getText(document);
|
return pdfStripper.getText(document);
|
||||||
|
}
|
||||||
|
|
||||||
case "docx":
|
case "docx":
|
||||||
XWPFDocument officeDocument = new XWPFDocument(new FileInputStream(file));
|
XWPFDocument officeDocument = new XWPFDocument(new FileInputStream(file));
|
||||||
|
@ -67,6 +69,7 @@ public class TextProcessing {
|
||||||
return text.toString();
|
return text.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Add the stopwords from the given map to the stopword list
|
||||||
public void textToSetStopwords(Map<String, Integer> words) {
|
public void textToSetStopwords(Map<String, Integer> words) {
|
||||||
Set<String> stopwords = new HashSet<>();
|
Set<String> stopwords = new HashSet<>();
|
||||||
for (Map.Entry<String, Integer> entry : words.entrySet()) {
|
for (Map.Entry<String, Integer> entry : words.entrySet()) {
|
||||||
|
@ -75,10 +78,12 @@ public class TextProcessing {
|
||||||
stopwordList.addAll(stopwords);
|
stopwordList.addAll(stopwords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Add a single stopword to the stopword list
|
||||||
public void addToStopWords(String stopword) {
|
public void addToStopWords(String stopword) {
|
||||||
stopwordList.add(stopword);
|
stopwordList.add(stopword);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Cut the word map down to at most maxWords entries for the HTML output
|
||||||
public Map<String, Integer> maxShowWords(Map<String, Integer> words, int maxWords) {
|
public Map<String, Integer> maxShowWords(Map<String, Integer> words, int maxWords) {
|
||||||
HashMap <String, Integer> cuttedHashmap = new HashMap<>();
|
HashMap <String, Integer> cuttedHashmap = new HashMap<>();
|
||||||
int index = maxWords;
|
int index = maxWords;
|
||||||
|
@ -91,6 +96,7 @@ public class TextProcessing {
|
||||||
return cuttedHashmap;
|
return cuttedHashmap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Sort the map entries in preparation for cutting the word map down
|
||||||
public Map<String, Integer> sortList(Map<String, Integer> unsortedMap) {
|
public Map<String, Integer> sortList(Map<String, Integer> unsortedMap) {
|
||||||
List<Map.Entry<String, Integer>> entryList = new ArrayList<>(unsortedMap.entrySet());
|
List<Map.Entry<String, Integer>> entryList = new ArrayList<>(unsortedMap.entrySet());
|
||||||
|
|
||||||
|
@ -103,6 +109,7 @@ public class TextProcessing {
|
||||||
return sortedMap;
|
return sortedMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Tokenizing, stemming, lowercasing and stopword filtering using Apache Lucene
|
||||||
public Map<String, Integer> tokenizingFile(String text) {
|
public Map<String, Integer> tokenizingFile(String text) {
|
||||||
Map<String, Integer> words = new HashMap<>();
|
Map<String, Integer> words = new HashMap<>();
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ import java.net.URL;
|
||||||
public class URLContentLoader {
|
public class URLContentLoader {
|
||||||
private String urlPath;
|
private String urlPath;
|
||||||
|
|
||||||
|
//extract Content from URL
|
||||||
public String loadURLContent() {
|
public String loadURLContent() {
|
||||||
StringBuilder text = new StringBuilder();
|
StringBuilder text = new StringBuilder();
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Map;
|
||||||
public class WordCloudCreator {
|
public class WordCloudCreator {
|
||||||
private int maxFontSize = 70;
|
private int maxFontSize = 70;
|
||||||
|
|
||||||
|
//Create html file with clickable words
|
||||||
public boolean insertWordsIntoTemplate(Map<String, Integer> wordMap) {
|
public boolean insertWordsIntoTemplate(Map<String, Integer> wordMap) {
|
||||||
File templateFile = new File("wordcloud.html"); // Template in project directory
|
File templateFile = new File("wordcloud.html"); // Template in project directory
|
||||||
File outputFile = new File("createdHTML.html"); // Output in project directory
|
File outputFile = new File("createdHTML.html"); // Output in project directory
|
||||||
|
@ -28,7 +29,7 @@ public class WordCloudCreator {
|
||||||
int frequency = entry.getValue();
|
int frequency = entry.getValue();
|
||||||
int fontSize = Math.min(10 + frequency * 2, maxFontSize); // Example: Base size 10px, increase by 2px per frequency
|
int fontSize = Math.min(10 + frequency * 2, maxFontSize); // Example: Base size 10px, increase by 2px per frequency
|
||||||
wordEntries.append(String.format(
|
wordEntries.append(String.format(
|
||||||
"<span id=\"%d\" class=\"wrd\" style=\"font-size:%dpx;\">" +
|
"<span id=\"%d\" class=\"wrd\" style=\"font-size:%dpx; margin-right:10px\">" +
|
||||||
"<a href=\"https://www.google.com/search?q=%s\" target=\"_blank\">%s</a>" +
|
"<a href=\"https://www.google.com/search?q=%s\" target=\"_blank\">%s</a>" +
|
||||||
"</span>\n",
|
"</span>\n",
|
||||||
id++, fontSize, word, word
|
id++, fontSize, word, word
|
||||||
|
|
|
@ -64,7 +64,7 @@ public class TUI {
|
||||||
public void fileMenu() {
|
public void fileMenu() {
|
||||||
while(fMenu) {
|
while(fMenu) {
|
||||||
System.out.println("(0) Load Stopwords\n(1) Add to Stopwords\n(2) Set Max Words in HTML\n" +
|
System.out.println("(0) Load Stopwords\n(1) Add to Stopwords\n(2) Set Max Words in HTML\n" +
|
||||||
"(3) Stemming not functioning!\n(4) Create WordCloud and Exit");
|
"(3) Activate German stemming\n(4) Create WordCloud and Exit");
|
||||||
option = Integer.parseInt(scan.nextLine());
|
option = Integer.parseInt(scan.nextLine());
|
||||||
switch(option) {
|
switch(option) {
|
||||||
case (0):
|
case (0):
|
||||||
|
@ -74,7 +74,8 @@ public class TUI {
|
||||||
System.out.println("File loaded successful!\n");
|
System.out.println("File loaded successful!\n");
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
System.out.println("Cannot load one more File!\n");
|
System.out.println("Cannot load one more File! Please use for more stopwords words " +
|
||||||
|
"menu option (1).\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|
Loading…
Reference in New Issue