"""
Module bundling all functions needed to scrape metadata from webpages.
"""
import json
import logging
import re
from copy import deepcopy
from courlan.clean import normalize_url
from courlan.core import extract_domain
from courlan.filters import validate_url
from htmldate import find_date
from lxml import html
from .json_metadata import extract_json, extract_json_parse_error
from .metaxpaths import author_xpaths, categories_xpaths, tags_xpaths, title_xpaths, author_discard_xpaths
from .utils import line_processing, load_html, normalize_authors, trim, check_authors
from .htmlprocessing import prune_unwanted_nodes
LOGGER = logging.getLogger(__name__)
logging.getLogger('htmldate').setLevel(logging.WARNING)
class Document:
"Defines a class to store all necessary data and metadata fields for extracted information."
__slots__ = [
'title', 'author', 'url', 'hostname', 'description', 'sitename',
'date', 'categories', 'tags', 'fingerprint', 'id', 'license',
'body', 'comments', 'commentsbody', 'raw_text', 'text'
]
# consider dataclasses for Python 3.7+
def __init__(self):
for slot in self.__slots__:
setattr(self, slot, None)
    def trim_data(self):
        '''Limit text length and trim the attributes.'''
        for slot in self.__slots__:
            value = getattr(self, slot)
            if isinstance(value, str):
                # limit the text length
                if len(value) > 10000:
                    value = value[:9999] + '…'
                # remove superfluous spaces and control characters
                setattr(self, slot, line_processing(value))
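# Illustrative sketch (commented out, not executed at import time):
# trim_data() caps string fields at 10,000 characters, assuming
# line_processing() leaves plain ASCII text unchanged.
# >>> document = Document()
# >>> document.title = 'x' * 10001
# >>> document.trim_data()
# >>> len(document.title)  # 9,999 characters plus the ellipsis
# 10000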
HTMLDATE_CONFIG_FAST = {'extensive_search': False, 'original_date': True}
HTMLDATE_CONFIG_EXTENSIVE = {'extensive_search': True, 'original_date': True}
JSON_MINIFY = re.compile(r'("(?:\\"|[^"])*")|\s')
HTMLTITLE_REGEX = re.compile(r'^(.+)?\s+[-|]\s+(.+)$') # split "title - site name" patterns; the segment containing a dot is most likely the site name
URL_COMP_CHECK = re.compile(r'https?://|/')
HTML_STRIP_TAG = re.compile(r'(<!--.*?-->|<[^>]*>)')
LICENSE_REGEX = re.compile(r'/(by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero)/([1-9]\.[0-9])')
TEXT_LICENSE_REGEX = re.compile(r'(cc|creative commons) (by-nc-nd|by-nc-sa|by-nc|by-nd|by-sa|by|zero) ?([1-9]\.[0-9])?', re.I)
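# Doctest-style illustrations of the expressions above (for reference only):
# >>> JSON_MINIFY.sub(r'\1', '{ "name": "Jane Doe" }')  # strips whitespace outside of strings
# '{"name":"Jane Doe"}'
# >>> HTMLTITLE_REGEX.match('My Article - example.org').groups()
# ('My Article', 'example.org')
# >>> LICENSE_REGEX.search('https://creativecommons.org/licenses/by-sa/4.0/').groups()
# ('by-sa', '4.0')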
METANAME_AUTHOR = {
'author', 'byl', 'citation_author', 'dc.creator', 'dc.creator.aut',
'dc:creator',
'dcterms.creator', 'dcterms.creator.aut', 'parsely-author',
'sailthru.author', 'shareaholic:article_author_name'
} # questionable: twitter:creator
METANAME_DESCRIPTION = {
'dc.description', 'dc:description',
'dcterms.abstract', 'dcterms.description',
'description', 'sailthru.description', 'twitter:description'
}
METANAME_PUBLISHER = {
'citation_journal_title', 'copyright', 'dc.publisher',
'dc:publisher', 'dcterms.publisher', 'publisher'
} # questionable: citation_publisher
METANAME_TAG = {
'citation_keywords', 'dcterms.subject', 'keywords', 'parsely-tags',
'shareaholic:keywords', 'tags'
}
METANAME_TITLE = {
'citation_title', 'dc.title', 'dcterms.title', 'fb_title',
'parsely-title', 'sailthru.title', 'shareaholic:title',
'title', 'twitter:title'
}
OG_AUTHOR = {'og:author', 'og:article:author'}
PROPERTY_AUTHOR = {'author', 'article:author'}
TWITTER_ATTRS = {'twitter:site', 'application-name'}
EXTRA_META = {'charset', 'http-equiv', 'property'}
def extract_meta_json(tree, metadata):
'''Parse and extract metadata from JSON-LD data'''
for elem in tree.xpath('.//script[@type="application/ld+json" or @type="application/settings+json"]'):
if not elem.text:
continue
element_text = JSON_MINIFY.sub(r'\1', elem.text)
try:
schema = json.loads(element_text)
metadata = extract_json(schema, metadata)
except json.JSONDecodeError:
metadata = extract_json_parse_error(element_text, metadata)
return metadata
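# Usage sketch: the input is an lxml tree containing JSON-LD; which fields get
# populated depends on extract_json() in json_metadata, so no output is asserted.
# >>> doc = html.fromstring(
# ...     '<html><head><script type="application/ld+json">'
# ...     '{"@type": "Article", "headline": "A Title"}'
# ...     '</script></head></html>')
# >>> metadata = extract_meta_json(doc, Document())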
def extract_opengraph(tree):
'''Search meta tags following the OpenGraph guidelines (https://ogp.me/)'''
title, author, url, description, site_name = (None,) * 5
# detect OpenGraph schema
for elem in tree.xpath('.//head/meta[starts-with(@property, "og:")]'):
# safeguard
if not elem.get('content'):
continue
# site name
if elem.get('property') == 'og:site_name':
site_name = elem.get('content')
# blog title
elif elem.get('property') == 'og:title':
title = elem.get('content')
# orig URL
elif elem.get('property') == 'og:url':
if validate_url(elem.get('content'))[0] is True:
url = elem.get('content')
# description
elif elem.get('property') == 'og:description':
description = elem.get('content')
# og:author
elif elem.get('property') in OG_AUTHOR:
author = elem.get('content')
# og:type
# elif elem.get('property') == 'og:type':
# pagetype = elem.get('content')
# og:locale
# elif elem.get('property') == 'og:locale':
# pagelocale = elem.get('content')
return trim(title), trim(author), trim(url), trim(description), trim(site_name)
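# Illustrative example, assuming trim() passes regular strings through
# unchanged and maps None to None:
# >>> doc = html.fromstring(
# ...     '<html><head>'
# ...     '<meta property="og:title" content="The Title"/>'
# ...     '<meta property="og:site_name" content="Example"/>'
# ...     '</head></html>')
# >>> extract_opengraph(doc)
# ('The Title', None, None, None, 'Example')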
def examine_meta(tree):
'''Search meta tags for relevant information'''
metadata = Document() # alt: Metadata()
# bootstrap from potential OpenGraph tags
title, author, url, description, site_name = extract_opengraph(tree)
    # return early if all fields were already assigned via OpenGraph
    if all((title, author, url, description, site_name)):
metadata.title, metadata.author, metadata.url, metadata.description, metadata.sitename = title, author, url, description, site_name
return metadata
tags, backup_sitename = [], None
# skim through meta tags
for elem in tree.iterfind('.//head/meta[@content]'):
# content
if not elem.get('content'):
continue
content_attr = HTML_STRIP_TAG.sub('', elem.get('content'))
# image info
# ...
# property
if 'property' in elem.attrib:
# no opengraph a second time
if elem.get('property').startswith('og:'):
continue
if elem.get('property') == 'article:tag':
tags.append(content_attr)
elif elem.get('property') in PROPERTY_AUTHOR:
author = normalize_authors(author, content_attr)
# name attribute
elif 'name' in elem.attrib:
name_attr = elem.get('name').lower()
# author
if name_attr in METANAME_AUTHOR:
author = normalize_authors(author, content_attr)
# title
elif name_attr in METANAME_TITLE:
title = title or content_attr
# description
elif name_attr in METANAME_DESCRIPTION:
description = description or content_attr
# site name
elif name_attr in METANAME_PUBLISHER:
site_name = site_name or content_attr
            elif name_attr in TWITTER_ATTRS or 'twitter:app:name' in name_attr:
backup_sitename = content_attr
# url
elif name_attr == 'twitter:url':
if url is None and validate_url(content_attr)[0] is True:
url = content_attr
# keywords
elif name_attr in METANAME_TAG: # 'page-topic'
tags.append(content_attr)
elif 'itemprop' in elem.attrib:
if elem.get('itemprop') == 'author':
author = normalize_authors(author, content_attr)
elif elem.get('itemprop') == 'description':
description = description or content_attr
elif elem.get('itemprop') == 'headline':
title = title or content_attr
# to verify:
# elif elem.get('itemprop') == 'name':
# if title is None:
# title = elem.get('content')
# other types
        elif all(key not in elem.attrib for key in EXTRA_META):
LOGGER.debug('unknown attribute: %s',
html.tostring(elem, pretty_print=False, encoding='unicode').strip())
# backups
if site_name is None and backup_sitename is not None:
site_name = backup_sitename
# copy
metadata.title, metadata.author, metadata.url, metadata.description, metadata.sitename, metadata.tags = title, author, url, description, site_name, tags
return metadata
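# Illustrative example: with no OpenGraph data present, a plain description
# meta tag ends up in the corresponding Document field (assuming trim()
# maps None to None):
# >>> doc = html.fromstring(
# ...     '<html><head><meta name="description" content="A short page."/></head></html>')
# >>> examine_meta(doc).description
# 'A short page.'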
def extract_metainfo(tree, expressions, len_limit=200):
'''Extract meta information'''
# try all XPath expressions
for expression in expressions:
# examine all results
        invalid_results = 0
        for elem in tree.xpath(expression):
            content = trim(' '.join(elem.itertext()))
            if content and 2 < len(content) < len_limit:
                # LOGGER.debug('metadata found in: %s', expression)
                return content
            invalid_results += 1
        if invalid_results > 1:
            LOGGER.debug('more than one invalid result: %s %s', expression, invalid_results)
return None
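# Illustrative example with a hand-rolled expression list (the real calls use
# the lists imported from metaxpaths):
# >>> doc = html.fromstring('<html><body><span class="byline">Jane Doe</span></body></html>')
# >>> extract_metainfo(doc, ['//span[@class="byline"]'])
# 'Jane Doe'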
def extract_title(tree):
'''Extract the document title'''
# only one h1-element: take it
h1_results = tree.xpath('//h1')
    if len(h1_results) == 1:
        title = trim(h1_results[0].text_content())
        if title:
            return title
# extract using x-paths
title = extract_metainfo(tree, title_xpaths)
if title is not None:
return title
# extract using title tag
try:
title = trim(tree.xpath('//head/title')[0].text_content())
# refine
mymatch = HTMLTITLE_REGEX.match(title)
if mymatch:
if '.' not in mymatch.group(1):
title = mymatch.group(1)
elif '.' not in mymatch.group(2):
title = mymatch.group(2)
return title
except IndexError:
LOGGER.warning('no main title found')
    # take first h1-title
    if h1_results:
        return trim(h1_results[0].text_content())
    # take first h2-title
    try:
        title = trim(tree.xpath('//h2')[0].text_content())
    except IndexError:
        LOGGER.warning('no h2 title found')
    return title
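# Illustrative example: a single h1 element wins over all other heuristics.
# >>> doc = html.fromstring('<html><body><h1>Nice Headline</h1></body></html>')
# >>> extract_title(doc)
# 'Nice Headline'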
def extract_author(tree):
'''Extract the document author(s)'''
subtree = prune_unwanted_nodes(deepcopy(tree), author_discard_xpaths)
author = extract_metainfo(subtree, author_xpaths, len_limit=120)
if author:
author = normalize_authors(None, author)
return author
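# Usage sketch: the result depends on author_xpaths and the pruning rules in
# author_discard_xpaths, both defined in metaxpaths, so no output is asserted.
# >>> author = extract_author(html.fromstring('<html><body>...</body></html>'))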
def extract_url(tree, default_url=None):
'''Extract the URL from the canonical link'''
# https://www.tutorialrepublic.com/html-reference/html-base-tag.php
# default url as fallback
url = default_url
# try canonical link first
element = tree.find('.//head//link[@rel="canonical"]')
if element is not None and 'href' in element.attrib and URL_COMP_CHECK.match(element.attrib['href']):
url = element.attrib['href']
# try default language link
else:
for element in tree.iterfind('.//head//link[@rel="alternate"]'):
            if (
                element.attrib.get('hreflang') == 'x-default'
                and URL_COMP_CHECK.match(element.attrib.get('href', ''))
            ):
LOGGER.debug(html.tostring(element, pretty_print=False, encoding='unicode').strip())
url = element.attrib['href']
# add domain name if it's missing
if url is not None and url.startswith('/'):
for element in tree.iterfind('.//head//meta[@content]'):
if 'name' in element.attrib:
attrtype = element.attrib['name']
elif 'property' in element.attrib:
attrtype = element.attrib['property']
else:
continue
if attrtype.startswith('og:') or attrtype.startswith('twitter:'):
domain_match = re.match(r'https?://[^/]+', element.attrib['content'])
if domain_match:
# prepend URL
url = domain_match.group(0) + url
break
# sanity check: don't return invalid URLs
    if url is not None:
        validation_result, parsed_url = validate_url(url)
        url = None if validation_result is False else normalize_url(parsed_url)
return url
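# Illustrative example: a relative canonical link is completed with the domain
# taken from an og:url tag; the exact output form depends on courlan's
# validate_url() and normalize_url().
# >>> doc = html.fromstring(
# ...     '<html><head>'
# ...     '<link rel="canonical" href="/post"/>'
# ...     '<meta property="og:url" content="https://example.org/"/>'
# ...     '</head></html>')
# >>> extract_url(doc)  # expected: 'https://example.org/post' after normalization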
def extract_sitename(tree):
'''Extract the name of a site from the main title (if it exists)'''
title_elem = tree.find('.//head/title')
if title_elem is not None and title_elem.text is not None:
mymatch = HTMLTITLE_REGEX.match(title_elem.text)
        if mymatch:
            # the first capture group is optional in the regex and can be None
            if mymatch.group(1) is not None and '.' in mymatch.group(1):
                return mymatch.group(1)
            if '.' in mymatch.group(2):
                return mymatch.group(2)
return None
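# Illustrative example: the title segment containing a dot is taken as site name.
# >>> doc = html.fromstring('<html><head><title>Latest News - example.org</title></head></html>')
# >>> extract_sitename(doc)
# 'example.org'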
def extract_catstags(metatype, tree):
'''Find category and tag information'''
results = []
    # match an optional plural suffix in the URL path (e.g. /tag/ and /tags/)
    regexpr = '/' + metatype + '(?:s|ies)?/'
    xpath_expression = categories_xpaths if metatype == 'category' else tags_xpaths
# search using custom expressions
for catexpr in xpath_expression:
for elem in tree.xpath(catexpr):
if 'href' in elem.attrib and re.search(regexpr, elem.attrib['href']):
results.append(elem.text_content())
if results:
break
# category fallback
if metatype == 'category' and not results:
element = tree.find('.//head//meta[@property="article:section"]')
if element is not None and 'content' in element.attrib:
results.append(element.attrib['content'])
    # line_processing() can return None, hence the second filtering pass
    results = [line_processing(x) for x in results if x is not None]
    return [x for x in results if x is not None]
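# Illustrative example of the category fallback, assuming none of the
# categories_xpaths expressions match this minimal document and that
# line_processing() leaves the value unchanged:
# >>> doc = html.fromstring(
# ...     '<html><head><meta property="article:section" content="Politics"/></head></html>')
# >>> extract_catstags('category', doc)
# ['Politics']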
def parse_license_element(element, strict=False):
'''Probe a link for identifiable free license cues.
Parse the href attribute first and then the link text.'''
if element.get('href') is not None:
# look for Creative Commons elements
match = LICENSE_REGEX.search(element.get('href'))
if match:
return 'CC ' + match.group(1).upper() + ' ' + match.group(2)
if element.text is not None:
# just return the anchor text without further ado
if strict is False:
return trim(element.text)
# else: check if it could be a CC license
match = TEXT_LICENSE_REGEX.search(element.text)
if match:
return match.group(0)
return None
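# Illustrative example: a Creative Commons href is mapped to a license string.
# >>> link = html.fromstring('<a href="https://creativecommons.org/licenses/by-sa/4.0/">here</a>')
# >>> parse_license_element(link, strict=True)
# 'CC BY-SA 4.0'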
def extract_license(tree):
'''Search the HTML code for license information and parse it.'''
result = None
# look for links labeled as license
for element in tree.xpath('//a[@rel="license"]'):
result = parse_license_element(element, strict=False)
if result is not None:
break
# probe footer elements for CC links
if result is None:
for element in tree.xpath(
'//footer//a|//div[contains(@class, "footer") or contains(@id, "footer")]//a'
):
result = parse_license_element(element, strict=True)
if result is not None:
break
return result
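# Illustrative example: a rel="license" link is picked up first.
# >>> doc = html.fromstring(
# ...     '<html><body><a rel="license" '
# ...     'href="https://creativecommons.org/licenses/by-nc-nd/3.0/">license</a></body></html>')
# >>> extract_license(doc)
# 'CC BY-NC-ND 3.0'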