1
0
Fork 0
BA-Chatbot_Ali_Thesis/data_service/crawler/hsma_url_crawler.py

29 lines
857 B
Python
Executable File

import scrapy
import json
# This crawls all available URLs for crawling from the hs-mannheim domain
# RUN WITH "scrapy runspider hsma_url_crawler.py"
class MySpider(scrapy.Spider):
    """Collect every URL linked from the hs-mannheim.de start pages.

    The spider visits the ``start_urls``, harvests every ``<a href>`` it
    finds into a set of absolute URLs, and dumps that set to ``urls.json``
    when the crawl finishes.

    Run with: ``scrapy runspider <this file>``
    """

    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]
    start_urls = ["https://www.hs-mannheim.de/", "https://www.startup.hs-mannheim.de/"]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'ROBOTSTXT_OBEY': True,
        # NOTE(review): parse() only harvests hrefs and never yields follow-up
        # requests, so this depth limit is currently inert — confirm whether
        # the links were meant to be followed one level deep.
        'DEPTH_LIMIT': 1,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance collection. The original defined ``urls = set()`` as a
        # class attribute, which would be shared by every spider instance —
        # the classic mutable-class-attribute pitfall.
        self.urls = set()

    def parse(self, response):
        """Harvest every absolute URL linked from *response* into ``self.urls``."""
        for href in response.css('a::attr(href)').getall():
            # urljoin resolves relative hrefs against the page URL.
            self.urls.add(response.urljoin(href))

    def closed(self, reason):
        """Scrapy close hook: persist the collected URLs to ``urls.json``.

        Sorted for a deterministic, diff-friendly output file; explicit
        UTF-8 encoding and ``ensure_ascii=False`` keep non-ASCII URLs
        human-readable.
        """
        with open('urls.json', 'w', encoding='utf-8') as f:
            json.dump(sorted(self.urls), f, ensure_ascii=False)