import scrapy
import json

# This spider crawls all available URLs from the hs-mannheim.de domain.
# RUN WITH "scrapy runspider hsma_crawler.py"
# (or see the optional __main__ block at the bottom of this file)

class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]
    start_urls = ["https://www.hs-mannheim.de/", "https://www.startup.hs-mannheim.de/"]
    custom_settings = {
        'LOG_LEVEL': 'INFO',      # keep log output brief
        'ROBOTSTXT_OBEY': True,   # respect robots.txt rules
        'DEPTH_LIMIT': 1,         # only follow links one level past the start pages
    }

    # Initialize the set of discovered URLs
    urls = set()

    def parse(self, response):
        # Follow all links on the page and record their absolute URLs
        for href in response.css('a::attr(href)').getall():
            url = response.urljoin(href)
            self.urls.add(url)
            # Request the linked page as well so URLs one level deeper are
            # collected too; allowed_domains and DEPTH_LIMIT keep this bounded
            yield response.follow(url, callback=self.parse)

    def closed(self, reason):
        # When spider closes, write URLs to file
        with open('urls.json', 'w') as f:
            json.dump(list(self.urls), f)
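
# Optional: instead of the "scrapy runspider" command noted above, the spider can
# also be launched programmatically. This is only a minimal sketch using Scrapy's
# CrawlerProcess; it assumes no project-level settings beyond custom_settings above.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(MySpider)
    process.start()  # blocks until the crawl finishes and closed() has written urls.json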