import json

import scrapy

# This spider crawls the content of every URL in a provided list.
# NOTE: Run hsma_url_crawler.py first to generate that list:
#       scrapy runspider hsma_url_crawler.py
# NOTE: Then execute this file with: scrapy runspider hsma_content_crawler.py
# NOTE: Afterwards, move the generated "url_texts.json" into the /data
#       directory and rename it to "crawled_hsma_web.json".
# TODO: Automate moving the file to the /data dir (see the sketch at the
#       bottom of this file).


class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'ROBOTSTXT_OBEY': True,
        'DEPTH_LIMIT': 1,
        # NOTE: FEED_FORMAT/FEED_URI are deprecated since Scrapy 2.1 in
        # favour of the FEEDS setting, but they still work.
        'FEED_FORMAT': 'json',
        'FEED_URI': 'url_texts.json',
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Read the URL list produced by hsma_url_crawler.py
        with open('urls.json', 'r', encoding='utf-8') as f:
            self.start_urls = json.load(f)

    def parse(self, response):
        # Extract all text nodes from the body, skipping boilerplate
        # elements: header, nav, footer, scripts, styles, the cookie-consent
        # container and the back-to-top link.
        text = response.xpath(
            '//body//text()[not('
            'ancestor::header or ancestor::nav or ancestor::footer or '
            'ancestor::script or ancestor::style or '
            'ancestor::*[contains(@class, "cc-container")] or '
            'ancestor::*[contains(@class, "c-top-link")])]'
        ).getall()
        # Strip whitespace, drop empty strings, and join the pieces
        text = ' '.join(t.strip() for t in text if t.strip())
        # Yield the scraped content
        yield {'url': response.url, 'content': text}
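

# A possible answer to the TODO above (a sketch, not part of the original
# pipeline): run the crawl in a subprocess and then move/rename the feed file
# as the NOTEs describe. The relative 'data/' destination is an assumption
# about the repository layout; adjust it if the data directory lives
# elsewhere. The __main__ guard keeps this block inert under
# `scrapy runspider`, which imports this file as an ordinary module.
if __name__ == '__main__':
    import shutil
    import subprocess

    # Run this spider exactly as the NOTEs above describe.
    subprocess.run(['scrapy', 'runspider', __file__], check=True)
    # Move and rename the feed file in one step.
    shutil.move('url_texts.json', 'data/crawled_hsma_web.json')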