I am building my first scrapy and am trying to retrieve the urls in the sitemap. The code works, but it seems that scrapy is also already crawling the website urls themselves. I do not want that. How do I get only the urls in the sitemap? Thanks a lot for your advice:
import scrapy
from scrapy.crawler import CrawlerProcess
class SitemapSpider(scrapy.spiders.SitemapSpider):
name = "sitemap_spider"
def __init__(self, sitemap_url, *args, **kwargs):
super(SitemapSpider, self).__init__(*args, **kwargs)
self.sitemap_urls = [sitemap_url]
self.extracted_urls = []
def parse(self, response):
print(response.url )
yield None
def run_sitemap_scraper(sitemap_url):
# Run the scraper
process = CrawlerProcess()
process.crawl(SitemapSpider, sitemap_url=sitemap_url)
process.start()
# Example usage
run_sitemap_scraper(".xml")
I am building my first scrapy and am trying to retrieve the urls in the sitemap. The code works, but it seems that scrapy is also already crawling the website urls themselves. I do not want that. How do I get only the urls in the sitemap? Thanks a lot for your advice:
import scrapy
from scrapy.crawler import CrawlerProcess
class SitemapSpider(scrapy.spiders.SitemapSpider):
name = "sitemap_spider"
def __init__(self, sitemap_url, *args, **kwargs):
super(SitemapSpider, self).__init__(*args, **kwargs)
self.sitemap_urls = [sitemap_url]
self.extracted_urls = []
def parse(self, response):
print(response.url )
yield None
def run_sitemap_scraper(sitemap_url):
# Run the scraper
process = CrawlerProcess()
process.crawl(SitemapSpider, sitemap_url=sitemap_url)
process.start()
# Example usage
run_sitemap_scraper("https://ferienparkguide.de/sitemap_index.xml")
Share
Improve this question
asked Mar 25 at 6:51
hal1988hal1988
31 bronze badge
1
|
2 Answers
Reset to default 0I solved my case a bit different in the end. Instead of the def parse above, I used:
# Only follow xml and txt entries
def sitemap_filter(self, entries):
for entry in entries:
print(entry["loc"])
if entry["loc"].endswith(".xml") or entry["loc"].endswith(".txt"):
yield entry
# Do not parse any websites
def parse(self, response):
yield None
Have a look at scrapy.spiders.SitemapSpider._parse_sitemap
SitemapSpider
for your custom script or just do it from scratch. – wRAR Commented Mar 25 at 20:00