BA-Chatbot/data_service/crawler/hsma_url_crawler.py

import scrapy
import json
# This crawls all available urls for crawling from the hs-mannheim domain
# RUN WITH "scrapy runspider hsma_url_crawler.py"

class MySpider(scrapy.Spider):
    """Collect the link targets found on the HS Mannheim start pages.

    Every ``<a href>`` on each fetched page is resolved to an absolute URL
    and stored in ``urls``. When the spider closes, the collected URLs are
    written to ``urls.json`` in the current working directory.

    NOTE(review): ``parse`` yields no follow-up requests, so only the pages
    in ``start_urls`` are fetched and ``DEPTH_LIMIT`` appears to have no
    effect. Confirm whether following links was intended.
    """

    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]
    start_urls = ["https://www.hs-mannheim.de/", "https://www.startup.hs-mannheim.de/"]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'ROBOTSTXT_OBEY': True,
        'DEPTH_LIMIT': 1,
    }

    # Absolute URLs seen so far; a set so repeated links are stored once.
    urls = set()

    def parse(self, response):
        """Record the absolute target of every anchor on ``response``.

        Links are only collected, not followed. No filtering by scheme or
        domain is applied here, so the set may contain off-domain targets.
        """
        for href in response.css('a::attr(href)').getall():
            # Resolve relative hrefs against the URL of the current page.
            self.urls.add(response.urljoin(href))

    def closed(self, reason):
        """Write the collected URLs to ``urls.json`` when the spider closes.

        ``reason`` is accepted but not used. The URLs are sorted so that
        the output file is identical across runs with the same input.
        """
        with open('urls.json', 'w', encoding='utf-8') as f:
            json.dump(sorted(self.urls), f)
initial 2023-11-15 14:28:48 +01:00			`import scrapy`
			`import json`
			`# This crawls all availiable urls for crawling from the hs-mannheim domain`
			`# RUN WITH "scrapy runspider hsma_crawler.py"`

			`class MySpider(scrapy.Spider):`
			`name = 'myspider'`
			`allowed_domains = ["hs-mannheim.de"]`
			`start_urls = ["https://www.hs-mannheim.de/", "https://www.startup.hs-mannheim.de/"]`
			`custom_settings = {`
			`'LOG_LEVEL': 'INFO',`
			`'ROBOTSTXT_OBEY': True,`
			`'DEPTH_LIMIT': 1,`
			`}`

			`# Initialize the list`
			`urls = set()`

			`def parse(self, response):`
			`# Follow all links on the page`
			`for href in response.css('a::attr(href)').getall():`
			`url = response.urljoin(href)`
			`self.urls.add(url)`

			`def closed(self, reason):`
			`# When spider closes, write URLs to file`
			`with open('urls.json', 'w') as f:`
			`json.dump(list(self.urls), f)`