BA-Chatbot/data_service/crawler/hsma_content_crawler.py



import scrapy
import json


# THIS crawls the text content of every URL in a provided list of URLs.
# NOTE: This means hsma_url_crawler.py has to be executed beforehand with the CMD: scrapy runspider hsma_url_crawler.py
# NOTE: Then execute this file with: scrapy runspider hsma_content_crawler.py
# NOTE: Afterwards, move the generated "url_texts.json" into the /data directory and rename it to "crawled_hsma_web.json"
# TODO: Automate moving the generated file to the /data dir
class MySpider(scrapy.Spider):
    """Spider that extracts the visible body text of each URL in a JSON list.

    The start URLs are read from a JSON file (``urls.json`` by default) when
    the spider is constructed. For every response, one item of the form
    ``{'url': ..., 'content': ...}`` is yielded; the feed settings below
    export those items to ``url_texts.json``.
    """

    name = 'myspider'
    allowed_domains = ["hs-mannheim.de"]

    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'ROBOTSTXT_OBEY': True,
        # parse() yields no follow-up requests, so only the start URLs are fetched.
        'DEPTH_LIMIT': 1,
        # NOTE(review): newer Scrapy releases prefer the FEEDS setting over
        # FEED_FORMAT/FEED_URI -- confirm against the installed Scrapy version.
        'FEED_FORMAT': 'json',
        'FEED_URI': 'url_texts.json',
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    # Text nodes inside <body>, skipping page chrome (header/nav/footer),
    # non-content elements (script/style) and the "cc-container" /
    # "c-top-link" widgets.
    _CONTENT_TEXT_XPATH = (
        '//body//text()[not('
        'ancestor::header or ancestor::nav or ancestor::footer'
        ' or ancestor::script or ancestor::style'
        ' or ancestor::*[contains(@class, "cc-container")]'
        ' or ancestor::*[contains(@class, "c-top-link")]'
        ')]'
    )

    def __init__(self, *args, urls_file='urls.json', **kwargs):
        """Initialise the spider and load its start URLs.

        Args:
            *args: Positional arguments forwarded to ``scrapy.Spider``.
            urls_file: Path of the JSON file holding the list of start URLs.
                Defaults to ``urls.json`` in the current working directory.
            **kwargs: Keyword arguments (e.g. spider arguments passed with
                ``-a``) forwarded to ``scrapy.Spider``.

        Raises:
            OSError: If ``urls_file`` cannot be opened.
            json.JSONDecodeError: If ``urls_file`` is not valid JSON.
        """
        # Let the base class do its own setup first; the assignment below
        # then replaces whatever start_urls the base class may have set.
        super().__init__(*args, **kwargs)
        # Explicit encoding so the result does not depend on the platform default.
        with open(urls_file, 'r', encoding='utf-8') as f:
            self.start_urls = json.load(f)

    def parse(self, response):
        """Yield the URL and the whitespace-normalised text of one page.

        Args:
            response: The Scrapy response for a crawled page.

        Yields:
            dict: ``{'url': <response URL>, 'content': <joined page text>}``.
        """
        # The previous loop calling ``.extract()`` on script/style/footer
        # selectors only serialised them and removed nothing, so the
        # exclusion is done entirely inside the XPath predicate instead.
        fragments = response.xpath(self._CONTENT_TEXT_XPATH).getall()
        # Trim each text node and drop the ones that were whitespace only.
        stripped = (fragment.strip() for fragment in fragments)
        content = ' '.join(piece for piece in stripped if piece)
        yield {'url': response.url, 'content': content}
initial 2023-11-15 14:28:48 +01:00

			`import scrapy`
			`import json`


			`# THIS crawls every content from a provided list of urls.`
			`# NOTE: This means that we have to execute previously the hsma_url_crawler.py with the CMD scrapy runspider hsma_crawler.py`
			`# NOTE: Then exectue this file with scrapy runspider hsma_content_crawler.py`
			`# NOTE: Move afterwards the generated "url_texts.json" into the /data directory and rename it to "crawled_hsma_web.json"`
			`# TODO: Automate the file moving process to the /data dir`
			`class MySpider(scrapy.Spider):`
			`name = 'myspider'`
			`allowed_domains = ["hs-mannheim.de"]`

			`custom_settings = {`
			`'LOG_LEVEL': 'INFO',`
			`'ROBOTSTXT_OBEY': True,`
			`'DEPTH_LIMIT': 1,`
			`'FEED_FORMAT':'json',`
			`'FEED_URI': 'url_texts.json',`
			`'FEED_EXPORT_ENCODING': 'utf-8'`

			`}`
			`def __init__(self):`
			`# Read the file and load the JSON`
			`with open('urls.json', 'r') as f:`
			`self.start_urls = json.load(f)`

			`def parse(self, response):`
			`# Remove script and style elements`
			`for script in response.xpath('//script \| //style \| //footer'):`
			`script.extract()`
			`# Ignore specific elements using XPath`
			`# Extract text from the remaining HTML elements`
			`text = response.xpath('//body//text()[not(ancestor::header or ancestor::nav or ancestor::footer or ancestor::script or ancestor::*[contains(@class, "cc-container")] or ancestor::*[contains(@class, "c-top-link")])]').getall()`
			`# Remove leading and trailing whitespace from each piece of text`
			`text = [t.strip() for t in text]`
			`# Remove empty strings`
			`text = [t for t in text if t != '']`
			`# Join the pieces of text`
			`text = ' '.join(text)`
			`# Yield the scraped content`
			`yield {'url': response.url, 'content': text}`