"""
Deriving link info from sitemaps.
"""
## This file is available from https://github.com/adbar/trafilatura
## under GNU GPL v3 license
import logging
import re
# import urllib.robotparser # Python >= 3.8
# ROBOT_PARSER = urllib.robotparser.RobotFileParser()
from courlan import clean_url, extract_domain, filter_urls, fix_relative_urls, get_hostinfo, lang_filter
from .downloads import fetch_url
from .settings import MAX_SITEMAPS_SEEN
LOGGER = logging.getLogger(__name__)
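# URLs listed inside <loc> elements, with an optional CDATA wrapper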
LINK_REGEX = re.compile(r'<loc>(?:<!\[CDATA\[)?(http.+?)(?:\]\]>)?</loc>')
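# <xhtml:link> alternate elements and the href values they carry (hreflang links)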
XHTML_REGEX = re.compile(r'<xhtml:link.+?>', re.DOTALL)
HREFLANG_REGEX = re.compile(r'href=["\'](.+?)["\']')
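# hosted blogging platforms for which diverging domain names are accepted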
WHITELISTED_PLATFORMS = re.compile(r'(?:blogger|blogspot|ghost|hubspot|livejournal|medium|typepad|squarespace|tumblr|weebly|wix|wordpress)\.')
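# markers of XML sitemap content: XML declaration, sitemap index or URL set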
SITEMAP_FORMAT = re.compile(r'<\?xml|<sitemap|<urlset')
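# URLs ending in .xml (possibly compressed, e.g. .xml.gz) or followed by a query/fragment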
DETECT_SITEMAP_LINK = re.compile(r'\.xml(\..{2,4})?$|\.xml[?#]')
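# bare HTTP(S) URLs, e.g. in plain-text sitemaps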
DETECT_LINKS = re.compile(r'https?://[^\s\r\n]+')
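# query strings and fragments to strip from URLs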
SCRUB_REGEX = re.compile(r'\?.*$|#.*$')
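# ".xml" anywhere in the URL, hinting at a sitemap file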
POTENTIAL_SITEMAP = re.compile(r'\.xml\b')
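# frequent sitemap locations to try when nothing else has been found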
GUESSES = ['sitemap.xml.gz', 'sitemap', 'sitemap_index.xml', 'sitemap_news.xml']
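
# Minimal usage sketch: shows how sitemap_search() is typically called. It
# assumes this module is importable as trafilatura.sitemaps and that the
# target site (example.org is only a placeholder) is reachable:
#
#     from trafilatura.sitemaps import sitemap_search
#     links = sitemap_search('https://www.example.org/', target_lang='en')
#     # -> sorted list of unique page URLs gathered from the site's sitemaps
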
def sitemap_search(url, target_lang=None):
"""Look for sitemaps for the given URL and gather links.
Args:
url: Webpage or sitemap URL as string.
Triggers URL-based filter if the webpage isn't a homepage.
target_lang: Define a language to filter URLs based on heuristics
(two-letter string, ISO 639-1 format).
Returns:
The extracted links as a list (sorted list of unique links).
"""
domainname, baseurl = get_hostinfo(url)
if domainname is None:
LOGGER.warning('Invalid URL: %s', url)
return []
urlfilter = None
# determine sitemap URL
if url.endswith('.xml') or url.endswith('.gz') or url.endswith('sitemap'):
sitemapurl = url
else:
sitemapurl = baseurl + '/sitemap.xml'
# filter triggered, prepare it
if len(url) > len(baseurl) + 2:
urlfilter = url
sitemapurls, linklist = download_and_process_sitemap(sitemapurl, domainname, baseurl, target_lang)
sitemaps_seen = {sitemapurl}
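    # the sitemap directly contained the links: filter and return them right away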
if sitemapurls == [] and len(linklist) > 0:
linklist = filter_urls(linklist, urlfilter)
LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
return linklist
# try sitemaps in robots.txt file if nothing has been found
if sitemapurls == [] and linklist == []:
sitemapurls = find_robots_sitemaps(baseurl)
# try additional URLs just in case
if not sitemapurls:
sitemapurls = [''.join([baseurl, '/', g]) for g in GUESSES]
# iterate through nested sitemaps and results
i = 1
while sitemapurls:
sitemapurl = sitemapurls.pop()
sitemapurls, linklist = download_and_process_sitemap(sitemapurl, domainname, baseurl, target_lang, sitemapurls, linklist)
# sanity check: keep track of visited sitemaps and exclude them
sitemaps_seen.add(sitemapurl)
sitemapurls = [s for s in sitemapurls if s not in sitemaps_seen]
# counter and safeguard
i += 1
if i > MAX_SITEMAPS_SEEN:
break
linklist = filter_urls(linklist, urlfilter)
LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
return linklist
def check_sitemap(url, contents):
'''Check if the sitemap corresponds to an expected format,
i.e. TXT or XML.'''
if contents is not None:
# strip query and fragments
url = SCRUB_REGEX.sub('', url)
        if ((POTENTIAL_SITEMAP.search(url)
             and (not isinstance(contents, str) or not SITEMAP_FORMAT.match(contents)))
                or '<html' in contents[:150].lower()):
LOGGER.info('not a valid XML sitemap: %s', url)
return None
return contents
def download_and_process_sitemap(url, domain, baseurl, target_lang, sitemapurls=None, linklist=None):
'Helper function chaining download and processing of sitemaps.'
# variables init
sitemapurls, linklist = sitemapurls or [], linklist or []
# fetch and pre-process
LOGGER.info('fetching sitemap: %s', url)
pagecontent = fetch_url(url)
add_sitemaps, add_links = process_sitemap(url, domain, baseurl, pagecontent, target_lang)
return sitemapurls + add_sitemaps, linklist + add_links
def process_sitemap(url, domain, baseurl, pagecontent, target_lang=None):
    'Parse a fetched sitemap (XML or TXT) and extract the links it contains.'
contents = check_sitemap(url, pagecontent)
# safeguard
if contents is None:
LOGGER.debug('not a sitemap: %s', url) # respheaders
return [], []
# try to extract links from TXT file
if not SITEMAP_FORMAT.match(contents):
sitemapurls, linklist = [], []
for match in DETECT_LINKS.finditer(contents):
result = match[0]
link, state = handle_link(result, url, domain, baseurl, target_lang)
sitemapurls, linklist = store_sitemap_link(sitemapurls, linklist, link, state)
return sitemapurls, linklist
# process XML sitemap
if target_lang is not None:
sitemapurls, linklist = extract_sitemap_langlinks(contents, url, domain, baseurl, target_lang)
if len(sitemapurls) != 0 or len(linklist) != 0:
return sitemapurls, linklist
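    # fall back to extracting all links, irrespective of their language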
return extract_sitemap_links(contents, url, domain, baseurl, target_lang)
def handle_link(link, sitemapurl, domainname, baseurl, target_lang):
'''Examine a link and determine if it's valid and if it leads to
a sitemap or a web page.'''
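    # possible states: '0' (invalid), 'sitemap' (nested sitemap) or 'link' (web page)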
state = '0'
# safety net: recursivity
if link == sitemapurl:
return link, state
# fix and check
link = fix_relative_urls(baseurl, link)
# clean and normalize
link = clean_url(link, target_lang)
if link is not None and lang_filter(link, target_lang) is True:
newdomain = extract_domain(link, fast=True)
if newdomain is None:
LOGGER.error("Couldn't extract domain: %s", link)
# don't take links from another domain and make an exception for main platforms
# also bypass: subdomains vs. domains
        elif newdomain != domainname and newdomain not in domainname and not WHITELISTED_PLATFORMS.search(newdomain):
LOGGER.warning('Diverging domain names: %s %s', domainname, newdomain)
else:
state = 'sitemap' if DETECT_SITEMAP_LINK.search(link) else 'link'
return link, state
def store_sitemap_link(sitemapurls, linklist, link, state):
    '''Store a link in the corresponding list depending on whether it points
    to a sitemap or a web page, or discard it.'''
if state == 'sitemap' and link is not None:
sitemapurls.append(link)
elif state == 'link' and link is not None:
linklist.append(link)
return sitemapurls, linklist
def extract_sitemap_langlinks(pagecontent, sitemapurl, domainname, baseurl, target_lang):
'Extract links corresponding to a given target language.'
if 'hreflang=' not in pagecontent:
return [], []
sitemapurls, linklist = [], []
# compile regex here for modularity and efficiency
lang_regex = re.compile(r"hreflang=[\"']({}.*?|x-default)[\"']".format(target_lang), re.DOTALL)
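    # e.g. matches hreflang="en", hreflang="en-GB" or hreflang="x-default" when target_lang is "en"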
for attr_match in XHTML_REGEX.finditer(pagecontent):
attributes = attr_match[0]
if lang_regex.search(attributes):
lang_match = HREFLANG_REGEX.search(attributes)
if lang_match:
link, state = handle_link(lang_match[1], sitemapurl, domainname, baseurl, target_lang)
sitemapurls, linklist = store_sitemap_link(sitemapurls, linklist, link, state)
LOGGER.debug('%s sitemaps and %s links with hreflang found for %s', len(sitemapurls), len(linklist), sitemapurl)
return sitemapurls, linklist
def extract_sitemap_links(pagecontent, sitemapurl, domainname, baseurl, target_lang):
'Extract sitemap links and web page links from a sitemap file.'
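    # e.g. a <url><loc>https://www.example.org/page.html</loc></url> entry
    # yields "https://www.example.org/page.html" (illustration only)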
sitemapurls, linklist = [], []
# extract
for match in LINK_REGEX.finditer(pagecontent):
        # pass the captured URL (first group of the match) on for processing
link, state = handle_link(match[1], sitemapurl, domainname, baseurl, target_lang)
sitemapurls, linklist = store_sitemap_link(sitemapurls, linklist, link, state)
LOGGER.debug('%s sitemaps and %s links found for %s', len(sitemapurls), len(linklist), sitemapurl)
return sitemapurls, linklist
def find_robots_sitemaps(baseurl):
'''Guess the location of the robots.txt file and try to extract
    sitemap URLs from it.'''
robotstxt = fetch_url(baseurl + '/robots.txt')
return extract_robots_sitemaps(robotstxt, baseurl)
def extract_robots_sitemaps(robotstxt, baseurl):
'Read a robots.txt file and find sitemap links.'
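    # e.g. a "Sitemap: https://www.example.org/sitemap.xml" line yields that URL;
    # relative entries are resolved against baseurl with fix_relative_urls()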
# sanity check on length (cause: redirections)
if robotstxt is None or len(robotstxt) > 10000:
return []
sitemapurls = []
# source: https://github.com/python/cpython/blob/3.8/Lib/urllib/robotparser.py
for line in robotstxt.splitlines():
# remove optional comment and strip line
i = line.find('#')
if i >= 0:
line = line[:i]
line = line.strip()
if not line:
continue
line = line.split(':', 1)
if len(line) == 2:
line[0] = line[0].strip().lower()
if line[0] == "sitemap":
# urllib.parse.unquote(line[1].strip())
candidate = fix_relative_urls(baseurl, line[1].strip())
sitemapurls.append(candidate)
LOGGER.debug('%s sitemaps found in robots.txt', len(sitemapurls))
return sitemapurls