Viewing file: InputSource.py (14.52 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
########################################################################
# $Header: /var/local/cvsroot/4Suite/Ft/Xml/InputSource.py,v 1.44 2005/03/19 08:11:01 jkloth Exp $
"""
Classes providing a standard interface and encapsulation of metadata for
document/entity streams intended for input to various XML processors.

Copyright 2005 Fourthought, Inc. (USA).
Detailed license and copyright information: http://4suite.org/COPYRIGHT
Project home, documentation, distributions: http://4suite.org/
"""
import os, cStringIO, types, warnings, mimetools
from Ft import FtWarning from Ft.Lib import Uri, Uuid
# Public names of this module: the two input source classes, the factory
# class, and the two ready-made factory instances created at the bottom of
# the file.
__all__ = ['InputSource', 'NullInputSource', 'InputSourceFactory', 'DefaultFactory', 'NoCatalogFactory']
class InputSource:
    """
    An input source is an encapsulation of a source of content.
    It includes a stream (Python file-like object) from which the
    content can be read, a URI to identify the stream and facilitate
    resolution of relative URI references / system IDs encountered
    within the stream, and parameters used by the processors of the
    stream (XML parsers, XSLT processors).

    It is designed to be overridden as applications need different
    functionality from sources.
    """

    def __init__(self, stream, uri=None, processIncludes=True,
                 stripElements=None, factory=None,
                 resolver=Uri.BASIC_RESOLVER, catalog=None, encoding=None):
        """
        stream - the stream associated with this input source
        uri - the absolute URI of the input source; if empty or None,
              a 'urn:uuid:' URI is generated for it
        processIncludes - Whether or not XIncludes should be expanded
        stripElements - Space stripping rules (defaults to an empty list)
        factory - The factory that created this instance
        resolver - URI resolver; defaults to Ft.Lib.Uri.BASIC_RESOLVER
        catalog - TR9401/XML Catalog object for resolving public IDs
        encoding - a string externally declaring the stream's encoding;
                   only used when no encoding can be determined from the
                   stream's own metadata
        """
        if uri:
            self.uri = uri
        else:
            self.uri = 'urn:uuid:' + Uuid.UuidAsString(Uuid.GenerateUuid())
        self.stream = stream
        self.processIncludes = processIncludes
        self.stripElements = stripElements or []
        self.factory = factory
        self.fragment = Uri.SplitFragment(self.uri)[1]
        self._resolver = resolver
        self._catalog = catalog
        # Metadata carried by the stream itself takes precedence over the
        # externally declared encoding.
        enc = self._getStreamEncoding(stream)
        if enc is None:
            enc = encoding
        self.encoding = enc
        self.name = self.uri
        return

    def _getStreamEncoding(self, stream):
        """
        Returns the encoding of the given stream, if this info can be
        determined from metadata in the stream object with a reasonable
        degree of confidence. Returns None otherwise.

        Adheres to RFC 3023, which requires the charset value in the
        Content-Type header to take precedence, or if no value is
        available, to assume us-ascii in the case of certain text/*
        media types. For other text/* media types, adheres to RFC 2616
        sec. 3.7.1, which requires the assumption of iso-8859-1, when
        the entity was transmitted by HTTP. Media type and charset info
        is ignored for streams believed to originate from a local file,
        in accordance with XML 1.0 Third Edition appendix F.2.
        """
        # We should never try to deduce the encoding when the stream is
        # a local file, in order to conform with XML 1.0 Third Edition
        # appendix F.2, and also because urllib.urlopen() uses
        # mimetypes.guess_type() to set the media type on both local
        # files and FTP resources, thus causing '*.xml' files to tend to
        # get a 'text/xml' mapping, which is bad because RFC 3023
        # requires them to be assumed to be us-ascii. Therefore, we must
        # look for clues that assure us that the stream is not likely to
        # be wrapping a file or FTP resource. The way to tell is to look
        # for the 'url' attribute on the stream object. urllib.urlopen()
        # MAY create this attribute and set it to the URL that was
        # passed in. Note that this 'URL' may have just been a local
        # filesystem path or partial URL or junk like 'C:/x/y/z'
        stream_url = getattr(stream, 'url', None)
        if stream_url is None:
            return None
        scheme = Uri.GetScheme(stream_url)
        # No scheme, or a one-letter "scheme" (see the 'C:/x/y/z' case
        # above): treat as a local resource.
        if scheme is None or len(scheme) == 1:
            return None
        # Compare schemes case-insensitively everywhere below.
        scheme = scheme.lower()
        if scheme in ('file', 'ftp'):
            return None

        # Get the stream metadata.
        # Streams created by urllib.urlopen() MAY have an info() method
        # that MAY return a mimetools.Message object. We can trust this
        # as a source of metadata since we have already ruled out the
        # likelihood of it being a local file or FTP resource.
        # The stream passed in is inspected, not self.stream, so the
        # method gives the right answer for any stream.
        info = None
        info_method = getattr(stream, 'info', None)
        if isinstance(info_method, types.MethodType):
            info = info_method()
        if isinstance(info, mimetools.Message):
            # use explicit charset if present and not empty string.
            charset = info.getparam('charset')
            if charset:
                return charset
            # charset empty or not present, so examine media type
            # and protocol.
            maintype = getattr(info, 'maintype', None)
            # Fall back to '' so a missing subtype cannot break endswith().
            subtype = getattr(info, 'subtype', None) or ''
            if maintype == 'text':
                if subtype in ('xml', 'xml-external-parsed-entity') or \
                   subtype.endswith('+xml'):
                    return 'us-ascii'
                elif scheme == 'http':
                    return 'iso-8859-1'
        # If we reach this point, the stream metadata was of no use,
        # so we'll let the parser determine the encoding from
        # the entity itself.
        return None

    def resolve(self, sysid, pubid=None, hint=None, ignoreErrors=False):
        """
        Resolves a system identifier (fragmentless URI reference) or
        a public identifier into a new input source. It is typically used
        when a URI reference is encountered in the original stream and
        needs to be resolved (e.g. to support an external entity
        reference, XInclude, xsl:include/import, document(), etc.).

        The hint parameter is used to give a hint as to what the
        resolution will be used for.

        The default implementation ignores the hint, and will resolve
        against a catalog, if one is available.

        If the ignoreErrors flag is set, an error during resolution
        (such as "file not found") does not raise an exception; the
        stream is then None and clone() turns that into a
        NullInputSource for the resolved URI.
        """
        sysid = self.resolveEntity(pubid, sysid)
        uri = self._normalize(sysid)
        stream = self._openStream(uri, ignoreErrors)
        # 'new stream' tells clone() not to inherit this source's encoding.
        return self.clone(stream, uri, hint='new stream')

    def getUriResolver(self):
        """
        This method returns the URI resolver that is used by this input
        source to normalize (resolve to absolute form) and resolve
        (dereference) URI references. This is the public method to use
        if just URI resolution is needed.
        """
        return self._resolver

    #xml.sax.EntityResolver method
    def resolveEntity(self, pubid, sysid):
        """
        Returns the system ID to use for the given public/system ID
        pair: the catalog's answer when a catalog is set, otherwise the
        system ID unchanged.
        """
        #FIXME: merge in support for FPI URNs from CatalogInputReader.resolve()
        if self._catalog:
            sysid = self._resolveFromCatalog(pubid, sysid)
        return sysid

    #Stream protocol methods
    def read(self, bufsiz=-1):
        """Reads from the underlying stream (delegates to stream.read)."""
        return self.stream.read(bufsiz)

    def readline(self):
        """Reads one line from the underlying stream."""
        return self.stream.readline()

    def close(self):
        """Closes the underlying stream."""
        return self.stream.close()

    #Helper Methods
    def _normalize(self, uriref):
        """
        Normalize (resolve to absolute form) a given URI reference,
        using the URI of this input source as the base.

        The default implementation will just use the default URI
        resolver.

        If your input source is working with non-standard or not
        supported URIs, then you will need to override this or the
        getUriResolver method.
        """
        return self.getUriResolver().normalize(uriref, self.uri)

    def _openStream(self, uri, ignoreErrors=False):
        """
        Returns a representation of a resource as a stream by
        resolving the given URI. If ignoreErrors is set, failure to
        obtain the stream will result in None being returned, rather
        than an exception (e.g. "file not found") being raised.
        KeyboardInterrupt and SystemExit are always propagated.

        Default behaviour is to use the resolver associated with this
        InputSource. If your custom InputSource needs to open URIs that
        are not supported natively by this InputSource (e.g., repository
        objects, or objects from a database), then you should override
        this method and do whatever it takes to resolve the URI into
        a readable stream.
        """
        try:
            return self.getUriResolver().resolve(uri)
        except (KeyboardInterrupt, SystemExit):
            # A user interrupt or interpreter exit is not a resolution
            # error and must never be turned into "no stream".
            raise
        except:
            # Deliberately broad: resolvers may raise any kind of error
            # and ignoreErrors is a best-effort switch.
            if ignoreErrors:
                return None
            raise

    def clone(self, stream, uri=None, hint=None):
        """
        Clones this input source, creating a new instance with
        the known params.

        stream - stream for the new instance; None yields a
                 NullInputSource
        uri - URI for the new instance; defaults to this source's URI
        hint - 'new stream' means the clone wraps a different resource,
               so this source's encoding is not inherited

        If your derived InputSource requires additional state information
        then you have to override how it is cloned and pickled.
        """
        if uri is None:
            uri = self.uri
        if stream is None:
            return NullInputSource(uri)
        if hint == 'new stream':
            # don't inherit encoding when cloning for self.resolve()
            enc = None
        else:
            enc = self.encoding
        return self.__class__(stream, uri,
                              processIncludes=self.processIncludes,
                              stripElements=self.stripElements,
                              factory=self.factory,
                              resolver=self._resolver,
                              catalog=self._catalog,
                              encoding=enc)

    def _resolveFromCatalog(self, publicId, systemId):
        """
        Return a new sysid if found through catalog rules, otherwise,
        return the original sysid
        """
        if systemId is not None:
            #FIXME: support URI re-write, delegateSystem and chained catalog
            #rules

            #FIXME (?): the actual lookup algo depends on the purpose
            #of the lookup (?!).  Can we use the hint arg to resolve to
            #determine this?  For one thing, should this function use
            #self._catalog.uris as well?

            #If a system identifier is provided, and at least one
            #matching system entry exists, return the (absolutized) value
            #of the uri attribute of the first matching system entry
            if systemId in self._catalog.systemIds:
                return self._catalog.systemIds[systemId]

        #If a public identifier is provided, and at least one
        #matching public entry exists, the (absolutized) value of the
        #uri attribute of the first matching public entry is returned.
        #If a system identifier is also provided as part of the input
        #to this catalog lookup, only public entries that occur where
        #the prefer setting is public are considered for matching.
        if publicId in self._catalog.publicIds:
            uri, prefer_public = self._catalog.publicIds[publicId]
            if prefer_public or systemId is None:
                return uri

        return systemId

    #Pickle routines.  We need to be able to pickle an input source
    #but cannot pickle a stream
    def __getstate__(self):
        """
        Returns a copy of the instance dictionary with the stream
        replaced by None.
        """
        state = self.__dict__.copy()
        state['stream'] = None
        return state
class NullInputSource(InputSource):
    """
    An InputSource that simulates an empty stream.
    """
    def __init__(self, uri=None):
        """
        uri - the URI to associate with the empty stream; passed on to
              InputSource.__init__ together with an empty StringIO
        """
        InputSource.__init__(self, cStringIO.StringIO(), uri)

    def clone(self, stream, uri=None, hint=None):
        """
        Clones this input source.

        The inherited clone() instantiates self.__class__ with a stream
        and keyword parameters, which this class's one-argument
        constructor cannot accept. A clone that has a real stream is
        therefore built as a plain InputSource carrying this source's
        parameters; a clone without a stream stays a NullInputSource.

        stream - stream for the new instance, or None
        uri - URI for the new instance; defaults to this source's URI
        hint - 'new stream' means the encoding is not inherited
        """
        if uri is None:
            uri = self.uri
        if stream is None:
            return NullInputSource(uri)
        if hint == 'new stream':
            enc = None
        else:
            enc = self.encoding
        return InputSource(stream, uri,
                           processIncludes=self.processIncludes,
                           stripElements=self.stripElements,
                           factory=self.factory,
                           resolver=self._resolver,
                           catalog=self._catalog,
                           encoding=enc)
class InputSourceFactory:
    """
    Builds InputSource objects (or instances of a caller-supplied class)
    that share one URI resolver and one catalog.
    """

    def __init__(self, inputSourceClass=None, resolver=Uri.BASIC_RESOLVER,
                 catalog=None):
        """
        inputSourceClass - class to instantiate; InputSource when omitted
        resolver - URI resolver handed to created input sources by default
        catalog - catalog handed to created input sources by default
        """
        if not inputSourceClass:
            inputSourceClass = InputSource
        self._klass = inputSourceClass
        self.resolver = resolver
        self.catalog = catalog

    def fromUri(self, uri, *v_args, **kw_args):
        """
        Builds an input source whose stream is obtained by dereferencing
        the given URI. Extra arguments are forwarded to fromStream().

        uri - a URI from which the input will be read.  Important: a file
              path is generally not a URI. To be safe, if you wish to read
              from a file, convert the path first with
              Ft.Lib.Uri.OsPathToUri().
        """
        if self.catalog:
            uri = self._resolveFromCatalog(uri)
        # Create a stream-less source first so that its own _openStream()
        # performs the dereference, then clone it around the real stream.
        placeholder = self.fromStream(None, uri, *v_args, **kw_args)
        opened = placeholder._openStream(placeholder.uri)
        return placeholder.clone(opened, placeholder.uri)

    def fromString(self, st, uri=None, *v_args, **kw_args):
        """
        Builds an input source that reads from the given byte string.
        The uri argument is the URI to use for the stream (one should
        always be given, even if it's bogus). Raises ValueError when st
        is not a str.
        """
        if isinstance(st, str):
            buf = cStringIO.StringIO(st)
            return self.fromStream(buf, uri, *v_args, **kw_args)
        raise ValueError("String must be of type string, not %s" % type(st))

    def fromStream(self, stream, uri=None, *v_args, **kw_args):
        """
        Builds an input source around an already open stream.
        The uri argument is the URI to use for the stream (one should
        always be given, even if it's bogus); a warning is issued when
        it is missing.
        """
        if not uri:
            warnings.warn("Creation of InputSource without a URI",
                          FtWarning, 2)
        # The factory always records itself; resolver and catalog are
        # only filled in when the caller did not supply them.
        kw_args['factory'] = self
        kw_args.setdefault('resolver', self.resolver)
        kw_args.setdefault('catalog', self.catalog)
        return self._klass(stream, uri, *v_args, **kw_args)

    def _resolveFromCatalog(self, uri):
        """
        Maps the URI through the catalog's 'uris' table; a URI without
        an entry is returned unchanged.
        """
        #This is simple URI lookup (e.g.
        #http://www.oasis-open.org/committees/entity/spec.html#s.uri.ent )
        known = self.catalog.uris
        if uri not in known:
            return uri
        return known[uri]
# Module-level factory instance created explicitly without a catalog.
NoCatalogFactory = InputSourceFactory(catalog=None)

# NOTE(review): this import sits at the bottom of the module, presumably to
# avoid a circular import with Ft.Xml.Catalog -- confirm before moving it.
from Ft.Xml.Catalog import GetDefaultCatalog
# Module-level factory instance using whatever GetDefaultCatalog() returns.
DefaultFactory = InputSourceFactory(catalog=GetDefaultCatalog())
|