import scrapy
import json


# This spider collects all available URLs from the hs-mannheim.de domain
# RUN WITH "scrapy runspider hsma_crawler.py"
class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]
    start_urls = ["https://www.hs-mannheim.de/", "https://www.startup.hs-mannheim.de/"]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'ROBOTSTXT_OBEY': True,
        'DEPTH_LIMIT': 1,
    }

    # Set of unique URLs collected during the crawl
    urls = set()

    def parse(self, response):
        # Collect and follow every link on the page; allowed_domains and
        # DEPTH_LIMIT keep the crawl restricted to hs-mannheim.de
        for href in response.css('a::attr(href)').getall():
            url = response.urljoin(href)
            self.urls.add(url)
            yield response.follow(href, callback=self.parse)

    def closed(self, reason):
        # When the spider closes, write the collected URLs to a JSON file
        with open('urls.json', 'w') as f:
            json.dump(list(self.urls), f)