BA-Chatbot/data_service/crawler/hsma_content_crawler.py



import scrapy
import json


# Crawls the text content of every URL in a provided list of URLs.
# NOTE: hsma_url_crawler.py has to be run beforehand with: scrapy runspider hsma_url_crawler.py
# NOTE: Then execute this file with: scrapy runspider hsma_content_crawler.py
# NOTE: Afterwards, move the generated "url_texts.json" into the /data directory and rename it to "crawled_hsma_web.json"
# TODO: Automate the file moving process to the /data dir
class MySpider(scrapy.Spider):
    """Fetch every URL from a JSON file and emit the page's visible text.

    The start URLs are loaded from ``urls.json`` (or the file given via the
    ``urls_file`` spider argument) when the spider is constructed. For each
    downloaded page one item ``{'url': ..., 'content': ...}`` is yielded;
    the feed settings below export those items to ``url_texts.json``.
    """

    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]

    # NOTE(review): FEED_FORMAT / FEED_URI are reported as deprecated in newer
    # Scrapy releases in favour of FEEDS -- confirm the pinned Scrapy version
    # before migrating.
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'ROBOTSTXT_OBEY': True,
        # parse() yields no follow-up requests, so only the start URLs are fetched.
        'DEPTH_LIMIT': 1,
        'FEED_FORMAT': 'json',
        'FEED_URI': 'url_texts.json',
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    # Selects every text node below <body> that is not part of page chrome
    # (header, nav, footer), not inside <script>/<style>, and not inside the
    # "cc-container" / "c-top-link" widgets. Selectors are read-only, so
    # unwanted nodes must be filtered here rather than "removed" beforehand.
    _TEXT_XPATH = (
        '//body//text()[not('
        'ancestor::header or ancestor::nav or ancestor::footer'
        ' or ancestor::script or ancestor::style'
        ' or ancestor::*[contains(@class, "cc-container")]'
        ' or ancestor::*[contains(@class, "c-top-link")]'
        ')]'
    )

    def __init__(self, *args, urls_file='urls.json', **kwargs):
        """Load the start URLs from a JSON file.

        Args:
            *args: Forwarded unchanged to ``scrapy.Spider.__init__``.
            urls_file: Path of the JSON file holding the URLs to crawl.
                Defaults to ``urls.json`` in the current working directory.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``
                (e.g. values passed with ``-a key=value``).

        Raises:
            OSError: If ``urls_file`` cannot be opened.
            json.JSONDecodeError: If ``urls_file`` is not valid JSON.
        """
        # Let the base class process spider arguments before start_urls is set.
        super().__init__(*args, **kwargs)
        # Explicit UTF-8 so the result does not depend on the platform locale.
        # assumes the file holds a JSON array of URL strings -- TODO confirm
        # against the output of the URL crawler.
        with open(urls_file, 'r', encoding='utf-8') as f:
            self.start_urls = json.load(f)

    def parse(self, response):
        """Yield the URL and the whitespace-normalised visible text of a page.

        Args:
            response: The downloaded page.

        Yields:
            dict: ``{'url': <page url>, 'content': <joined text>}`` where the
            text fragments are stripped, empty ones dropped, and the rest
            joined with single spaces.
        """
        fragments = response.xpath(self._TEXT_XPATH).getall()
        # Trim each fragment, then drop the ones that were whitespace only.
        stripped = (fragment.strip() for fragment in fragments)
        content = ' '.join(piece for piece in stripped if piece)
        yield {'url': response.url, 'content': content}