import json

import scrapy

# Crawls the content of every page in a provided list of URLs.
# NOTE: hsma_url_crawler.py has to be executed first
#       (CMD: scrapy runspider hsma_crawler.py) to produce the URL list.
# NOTE: Then execute this file with: scrapy runspider hsma_content_crawler.py
# NOTE: Afterwards, move the generated "url_texts.json" into the /data
#       directory and rename it to "crawled_hsma_web.json".
# TODO: Automate the file moving process to the /data dir
#       (a possible sketch is the closed() callback at the bottom of the file).
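# The full workflow as shell commands (for illustration; assumes
# hsma_url_crawler.py writes urls.json, which __init__ below reads):
#   $ scrapy runspider hsma_crawler.py          # collect the URL list
#   $ scrapy runspider hsma_content_crawler.py  # this spider; writes url_texts.json
#   $ mv url_texts.json data/crawled_hsma_web.json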


class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]

    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'ROBOTSTXT_OBEY': True,
        'DEPTH_LIMIT': 1,
        # FEED_FORMAT/FEED_URI are deprecated since Scrapy 2.1; the FEEDS
        # setting is the equivalent modern replacement.
        'FEEDS': {
            'url_texts.json': {
                'format': 'json',
                'encoding': 'utf-8',
            },
        },
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Read the file and load the JSON list of start URLs
        with open('urls.json', 'r', encoding='utf-8') as f:
            self.start_urls = json.load(f)
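        # Expected shape of urls.json (an assumption, inferred from the
        # json.load() above): a flat JSON array of absolute URLs, e.g.
        #   ["https://www.hs-mannheim.de/", "https://www.hs-mannheim.de/foo.html"]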

    def parse(self, response):
        # Drop script, style and footer nodes from the document tree so
        # their text is not extracted below
        for node in response.xpath('//script | //style | //footer'):
            node.drop()
        # Extract text from the remaining HTML elements, ignoring header,
        # navigation, footer, cookie-consent and top-link containers via XPath
        text = response.xpath('//body//text()[not(ancestor::header or ancestor::nav or ancestor::footer or ancestor::script or ancestor::*[contains(@class, "cc-container")] or ancestor::*[contains(@class, "c-top-link")])]').getall()
        # Remove leading and trailing whitespace from each piece of text
        text = [t.strip() for t in text]
        # Remove empty strings
        text = [t for t in text if t != '']
        # Join the pieces of text
        text = ' '.join(text)
        # Yield the scraped content
        yield {'url': response.url, 'content': text}
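
    # A possible approach to the TODO above, as a hedged sketch: Scrapy calls
    # closed() when the crawl finishes, so the feed can be moved there. The
    # data/ path (relative to the working directory) is an assumption, and
    # feed flushing happens on the same spider_closed signal, so the file may
    # not be final yet when this runs; treat it as a starting point only.
    def closed(self, reason):
        import shutil

        shutil.move('url_texts.json', 'data/crawled_hsma_web.json')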