#!/usr/bin/env python
"""
Extraction parsers for structured data embedded into HTML or XML files.
The former may include RDFa or microdata. The syntax and the extraction
procedures are based on:

* The RDFa specifications: http://www.w3.org/TR/#tr_RDFa
* The microdata specification: http://www.w3.org/TR/microdata/
* The specification of the microdata to RDF conversion:
  http://www.w3.org/TR/microdata-rdf/

License: W3C Software License,
http://www.w3.org/Consortium/Legal/copyright-software
Author: Ivan Herman
Copyright: W3C
"""

from rdflib.parser import (
    Parser, StringInputSource, URLInputSource, FileInputSource)

# Probe for the optional html5lib dependency. After this statement the
# module-level name ``html5lib`` is a plain boolean availability flag,
# not the module object.
try:
    import html5lib
    assert html5lib
except ImportError:
    import warnings
    warnings.warn(
        'html5lib not found! RDFa and Microdata ' +
        'parsers will not be available.')
    html5lib = False
else:
    html5lib = True


def _get_orig_source(source):
    """
    Recover the original reference behind an RDFLib input source.

    The RDFa/microdata parsers need more than what the upper layers of
    RDFLib hand over, so this helper digs the underlying reference out of
    the input source object.

    @param source: the input source handed over by RDFLib
    @return: a ``(baseURI, orig_source)`` tuple; ``baseURI`` is the value
        of ``source.getPublicId()`` and ``orig_source`` is the byte stream
        (string sources and any unrecognised source), the URL (URL
        sources) or the file name (file sources)
    """
    def _original_reference():
        # The order of the type checks mirrors the source classes one by
        # one; anything unrecognised falls back to the byte stream.
        if isinstance(source, StringInputSource):
            return source.getByteStream()
        if isinstance(source, URLInputSource):
            return source.url
        if isinstance(source, FileInputSource):
            file_name = source.file.name
            # Only the name is handed on, so the already opened file
            # object is closed here.
            source.file.close()
            return file_name
        return source.getByteStream()

    reference = _original_reference()
    # The public id is read only after the reference has been resolved.
    return (source.getPublicId(), reference)


def _check_error(graph):
    """
    Raise an exception for the first described RDFa error in a graph.

    Every subject typed as ``RDFA_Error`` is inspected; as soon as one of
    them carries a ``dc:description`` triple, that description is reported.
    Error subjects without a description do not trigger anything.

    @param graph: the graph to inspect for error triples
    @raise Exception: with the message ``RDFa parsing Error! <description>``
    """
    from .pyRdfa import RDFA_Error, ns_rdf
    from .pyRdfa.options import ns_dc

    # Lazily walk "error subject -> description" pairs; nothing beyond the
    # first description is ever evaluated because the loop below raises.
    messages = (
        message
        for node, _, _ in graph.triples((None, ns_rdf["type"], RDFA_Error))
        for _, _, message in graph.triples(
            (node, ns_dc["description"], None))
    )
    for message in messages:
        raise Exception("RDFa parsing Error! %s" % message)


# This is the parser interface as it would look when called from the # rest of RDFLib class RDFaParser(Parser): """ Wrapper around the RDFa 1.1 parser. For further details on the RDFa 1.1 processing, see the relevant W3C documents at http://www.w3.org/TR/#tr_RDFa. RDFa 1.1 is defined for XHTML, HTML5, SVG and, in general, for any XML language. 
Note that the parser can also handle RDFa 1.0 if the extra parameter is used and/or the input source uses RDFa 1.0 specific @version or DTD-s. """ def parse(self, source, graph, pgraph=None, media_type="", rdfa_version=None, embedded_rdf=False, space_preserve=True, vocab_expansion=False, vocab_cache=False, refresh_vocab_cache=False, vocab_cache_report=False, check_lite=False): """ @param source: one of the input sources that the RDFLib package defined @type source: InputSource class instance @param graph: target graph for the triples; output graph, in RDFa spec. parlance @type graph: RDFLib Graph @keyword pgraph: target for error and warning triples; processor graph, in RDFa spec. parlance. If set to None, these triples are ignored @type pgraph: RDFLib Graph @keyword media_type: explicit setting of the preferred media type (a.k.a. content type) of the RDFa source. None means the content type of the HTTP result is used, or a guess is made based on the suffix of a file @type media_type: string @keyword rdfa_version: 1.0 or 1.1. If the value is "", then, by default, 1.1 is used unless the source has explicit signals to use 1.0 (e.g., using a @version attribute, using a DTD set up for 1.0, etc) @type rdfa_version: string @keyword embedded_rdf: some formats allow embedding RDF in other formats: (X)HTML can contain turtle in a special