BA-Chatbot/data_service/crawler/hsma_url_crawler.py

29 lines
857 B
Python
Raw Permalink Normal View History

2023-11-15 14:28:48 +01:00
import scrapy
import json
# Collects all URLs available for crawling from the hs-mannheim.de domain.
# Run with: "scrapy runspider hsma_url_crawler.py"
class MySpider(scrapy.Spider):
    """Collect the link targets found on the hs-mannheim.de start pages.

    Every ``<a href>`` on each fetched page is resolved to an absolute URL
    and stored in ``urls``. When the spider closes, the collected URLs are
    written to ``urls.json`` (relative to the current working directory) as
    a sorted JSON array.
    """

    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]
    start_urls = ["https://www.hs-mannheim.de/", "https://www.startup.hs-mannheim.de/"]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'ROBOTSTXT_OBEY': True,
        # NOTE(review): parse() does not yield follow-up requests, so only
        # the start URLs are fetched and this limit currently has no effect.
        'DEPTH_LIMIT': 1,
    }

    # Unique absolute URLs seen so far. This is a class-level set, so it is
    # shared by every instance of this spider within one process.
    urls = set()

    def parse(self, response):
        """Record the absolute HTTP(S) URL of every link on ``response``.

        Links are only collected, not followed. Targets that are not
        crawlable web pages (``mailto:``, ``tel:``, ``javascript:`` ...)
        are skipped so that the output only contains fetchable URLs.
        """
        for href in response.css('a::attr(href)').getall():
            url = response.urljoin(href)
            # urljoin leaves non-hierarchical schemes such as "mailto:"
            # untouched, so filter on the scheme of the resolved URL.
            if url.startswith(("http://", "https://")):
                self.urls.add(url)

    def closed(self, reason):
        """Write the collected URLs to ``urls.json`` when the spider closes.

        ``reason`` is the close reason passed in by Scrapy; it is not used.
        """
        # Sort for a deterministic file: set iteration order varies between
        # runs, which would otherwise produce spurious diffs.
        with open('urls.json', 'w', encoding='utf-8') as f:
            json.dump(sorted(self.urls), f)