BA-Chatbot/data_service/crawler/hsma_content_crawler.py

44 lines
1.8 KiB
Python
Raw Normal View History

2023-11-15 14:28:48 +01:00
import scrapy
import json
# This spider crawls the text content of every URL in a provided list of URLs.
# NOTE: This means hsma_url_crawler.py has to be executed beforehand with the CMD scrapy runspider hsma_crawler.py
# NOTE: Then execute this file with scrapy runspider hsma_content_crawler.py
# NOTE: Afterwards, move the generated "url_texts.json" into the /data directory and rename it to "crawled_hsma_web.json"
# TODO: Automate the file moving process to the /data dir
class MySpider(scrapy.Spider):
    """Spider that downloads each URL listed in ``urls.json`` and emits its visible text.

    Every response produces one item ``{'url': ..., 'content': ...}``; the
    feed settings below export the items as JSON to ``url_texts.json``.
    """

    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]
    # NOTE(review): FEED_FORMAT / FEED_URI are deprecated in favour of the
    # FEEDS setting since Scrapy 2.1 -- consider migrating once the minimum
    # supported Scrapy version is confirmed.
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'ROBOTSTXT_OBEY': True,
        'DEPTH_LIMIT': 1,
        'FEED_FORMAT': 'json',
        'FEED_URI': 'url_texts.json',
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    # Selects every text node below <body> that is not inside page chrome
    # (header, nav, footer), non-content markup (script, style) or an element
    # whose class contains "cc-container" / "c-top-link".
    _TEXT_XPATH = (
        '//body//text()[not('
        'ancestor::header or ancestor::nav or ancestor::footer'
        ' or ancestor::script or ancestor::style'
        ' or ancestor::*[contains(@class, "cc-container")]'
        ' or ancestor::*[contains(@class, "c-top-link")]'
        ')]'
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and load the start URLs from ``urls.json``.

        Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
        that spider arguments (``-a key=value``) keep working.

        Raises:
            FileNotFoundError: if ``urls.json`` is missing from the working directory.
            json.JSONDecodeError: if ``urls.json`` is not valid JSON.
        """
        super().__init__(*args, **kwargs)
        # Read the file and load the JSON.
        # Assumes urls.json holds a JSON array of URL strings -- TODO confirm
        # against the output of the URL crawler.
        with open('urls.json', 'r', encoding='utf-8') as f:
            self.start_urls = json.load(f)

    def parse(self, response):
        """Yield the cleaned text content of a single crawled page.

        Args:
            response: the Scrapy response for one of the start URLs.

        Yields:
            dict: ``{'url': <response URL>, 'content': <text joined by single spaces>}``.
        """
        # Unwanted elements are excluded inside the XPath itself: calling
        # .extract() on a selector only serialises the node, it does not
        # remove it from the document.
        fragments = response.xpath(self._TEXT_XPATH).getall()
        # Strip surrounding whitespace and drop pieces that end up empty.
        stripped = (fragment.strip() for fragment in fragments)
        text = ' '.join(piece for piece in stripped if piece)
        # Yield the scraped content
        yield {'url': response.url, 'content': text}