Skip to content

ReportLab Docs

XML helper utilities

ReportLab's XML Tools

We've been around for a very long time. When we got started with RML in 2001, we rapidly found a need for a lot of markup processing tools, and Python did not have the standard solutions it has today.

This page documents some of the functions in the rlextra.radxml package which you might find useful.

Sanitising markup, and converting to RML

functions in html_cleaner

Here are some of the important cleaning functions within rlextra.

`Cleaner`

Bases: HTMLParser

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

class Cleaner(HTMLParser):

    def __init__(self,
                target="block",
                breaksAllowed=True,
                stripComments=False,
                stripUnknownEntities=True,
                allowImages=True,
                allowTables=True,
                allowAtags=True,
                allowStyleAttr=True, #passed through to para and headings
                allowAlignAttr=True, #passed through to para and headings
                aHrefTr=None,
                imgSrcTr=None,
                substitutions=[],
                maxLines=None,
                lineWidth=40,
                entities = known_entities.keys(),
                encoding = None,
                special_attr_pfxs=['rl-'],
                ):
        """Initialising defines your language options.
        You can re-use the same parser many times.

        target          'block' or 'inline'; the kind of markup to produce.
        breaksAllowed   if true, <br/> tags are written to output. If not,
                        in inline mode they become a single space, and in
                        block mode they end the block.
        stripComments   if true, comments are dropped from the output.
        stripUnknownEntities
                        if true, named entities not in 'entities' are dropped.
        allowImages, allowAtags
                        if false, 'img' / 'a' are removed from the grammar.
        allowTables     if false, tables and everything in them are removed.
        allowStyleAttr, allowAlignAttr
                        allow 'style' / 'align' on paragraphs and headings.
        aHrefTr, imgSrcTr
                        a string or a callable used by writeStartTag to
                        rewrite 'href' / 'src' values.
        maxLines, lineWidth
                        optional line budget, and the characters per line
                        used to estimate how much of it a block consumes.
        entities        names of the known entities; 'lt', 'gt' and 'amp'
                        are always added.
        encoding        if set, process() returns its result encoded this way.
        special_attr_pfxs
                        attribute-name prefixes (used as regex fragments)
                        that are always passed through. An empty sequence
                        means no attribute is special.

        substitutions is a singleton or list containing
            pat         pat --> ''
            (pat,str)   pat --> str
            callable    c(src) --> src

        pat may be a str or a compiled pattern. These substitutions
        are done before parsing.

        Raises ValueError for an item of substitutions that is none of
        the above. The list defaults are only read, never mutated.
        """
        target = self.asUnicode(target)
        self.stripUnknownEntities = stripUnknownEntities
        self.allowImages = allowImages
        self.allowTables = allowTables
        self.allowAtags = allowAtags
        self.aHrefTr = aHrefTr
        self.imgSrcTr = imgSrcTr
        self.encoding = encoding
        self.allowAlignAttr = allowAlignAttr
        self.allowStyleAttr = allowStyleAttr
        if special_attr_pfxs:
            self.special_attr = re.compile('^(?:%s)' % '|'.join(special_attr_pfxs)).match
        else:
            #an empty alternation '^(?:)' would match every attribute name
            self.special_attr = lambda name: None
        self._setupGrammar()

        assert target in (u"block", u"inline"), "unexpected block '%s', must be 'block' or 'inline'" % target
        self.target = target
        HTMLParser.__init__(self,
            **(dict(convert_charrefs=False) if sys.version_info>=(3,4) else {}))
        self.breaksAllowed = breaksAllowed
        self.stripComments = stripComments
        self.entities = set(entities).union(('lt', 'gt', 'amp'))

        #turn every item of the substitutions list into a callable src --> src
        if not isinstance(substitutions,(list,tuple)):
            substitutions = (substitutions,)
        prepared = []
        for s in substitutions:
            if isinstance(s,strTypes):
                s = lambda x,pat=re.compile(asUnicode(s)): pat.sub('',x)
            elif hasattr(s,'sub'):
                s = lambda x,pat=s: pat.sub('',x)
            elif isinstance(s,(tuple,list)) and len(s)==2:
                p, repl = s
                if isinstance(p,strTypes):
                    p = re.compile(asUnicode(p))
                elif not hasattr(p,'sub'):
                    #wrap in a 1-tuple: a bare tuple would be unpacked by %
                    raise ValueError('Invalid value %r in substitions list' % (s,))
                s = lambda x,pat=p,repl=repl: pat.sub(repl,x)
            elif not callable(s):
                raise ValueError('Invalid value %r in substitions list' % (s,))
            prepared.append(s)
        self.substitutions = prepared
        self.remainingLines = maxLines
        self.lineWidth = lineWidth
        self.textLength = 0

    def _setupGrammar(self):
        """Build the tables describing which tags are legal where.

        Reads self.allowStyleAttr, self.allowAlignAttr, self.allowImages,
        self.allowAtags (and, through tagsAllowedInContext, self.allowTables)
        and sets:
            DEFAULT_BLOCK_TAG   tag used to wrap naked text at block level
            _contextDict        context name -> dict with 'allowedIn' and
                                'canContain' (set of tag names)
            _tagDict            tag name -> dict with 'context', 'attrs',
                                'selfClosing' and optionally 'force_attrs'
            _dataCover          context name -> tags to open around naked text
            valid_tags, valid_block_tags, valid_inline_tags, valid_other_tags
        """

        self.DEFAULT_BLOCK_TAG = 'p'    #used to close blocks if nothing else given
        #copy of d whose keys have been passed through asUnicode
        def uniDict(d):
            r = {}
            for k in d:
                r[asUnicode(k)] = d[k]
            return r

        #index a list of dicts by each dict's 'name' entry
        def byName(listOfDicts):
            out = {}
            for d in listOfDicts:
                d = uniDict(d)
                out[d['name']] = d
            return out

        #situations in which a tag can appear.
        self._contextDict = byName([
            dict(name='block', allowedIn=None),  #None=top-level
            dict(name='inline', allowedIn='block'),
            dict(name='table', allowedIn='block'),
            dict(name='list', allowedIn='block'),
            dict(name='li', allowedIn='list'),
            dict(name='tr', allowedIn='table'),
            dict(name='td', allowedIn='tr'),    #no contents for now
            ])
#        if not self.allowTables:
#            del self._contextDict['table']

        #filled in further down from each tag's 'context' entry
        for context in self._contextDict.values():
            context['canContain'] = set()

        #attributes passed through on paragraphs and headings; the same
        #list object is shared by all the block tags below
        allowParaAttrs = []
        if self.allowStyleAttr: allowParaAttrs.append('style')
        if self.allowAlignAttr: allowParaAttrs.append('align')

        #'context' is either one context name or a tuple of them; the first
        #one is what getContext reports as the tag's main context
        self._tagDict = byName([
            dict(name='p',context='block', attrs=allowParaAttrs),
            dict(name='h1',context='block', attrs=allowParaAttrs),
            dict(name='h2',context='block', attrs=allowParaAttrs),
            dict(name='h3',context='block', attrs=allowParaAttrs),
            dict(name='h4',context='block', attrs=allowParaAttrs),
            dict(name='h5',context='block', attrs=allowParaAttrs),
            dict(name='h6',context='block', attrs=allowParaAttrs),

            dict(name='strong',context=('inline', 'li', 'td'), attrs=[]),
            dict(name='em',context=('inline', 'li', 'td'), attrs=[]),
            dict(name='i',context=('inline', 'li', 'td'), attrs=[]),
            dict(name='b',context=('inline', 'li', 'td'), attrs=[]),
            dict(name='u',context=('inline', 'li', 'td'), attrs=[]),
            # NOTE(review): the next entry has an empty tag name - possibly a
            # tag name that was lost somewhere; confirm against the original.
            dict(name='',context=('inline', 'li', 'td'), attrs=[]),
            dict(name='sup',context=('inline', 'li', 'td'), attrs=[]),
            dict(name='sub',context=('inline', 'li', 'td'), attrs=[]),
            dict(name='br',context=('inline', 'li', 'td'), attrs=[], selfClosing=True),
            dict(name='a',context=('inline', 'li', 'td'), attrs=['href']),

            # force_attrs. These attributes will be added to make sure xhtml validation passes.
            dict(name='img',context=('inline', 'li'), attrs=['src','width','height','alt'], force_attrs=['alt'], selfClosing=True),

            dict(name='table',context=('block','td'), attrs=[]),
            dict(name='tr',context='table', attrs=[]),
            dict(name='td',context='tr', attrs=[]),
            dict(name='th',context='tr', attrs=[]),

            dict(name='ul',context=('block','td', 'li'), attrs=[]),
            dict(name='ol',context=('block','td', 'li'), attrs=[]),
            dict(name='li',context='list', attrs=[]),
            ])

        # Tags to use to cover naked text up with in the given context
        self._dataCover = uniDict(dict(
            block=(self.DEFAULT_BLOCK_TAG,),
            table=('tr', 'td'),
            tr=('td',),
            #td=('p',),
            list=('li',),
            ))

#        if not self.allowTables:
#            del self._tagDict['table']
#            del self._tagDict['tr']
#            del self._tagDict['td']

        #images and links are removed from the grammar altogether when disabled
        if not self.allowImages:
            del self._tagDict['img']
        if not self.allowAtags:
            del self._tagDict['a']

        #work out in-place the set of tags allowed in each context.

        for tagName,tag in self._tagDict.items():
            #tags are not self-closing unless they say so
            if 'selfClosing' not in tag:
                tag['selfClosing'] = False

            contexts = tag['context']
            if not isSeq(contexts):
                contexts = contexts,
            for ctxName in contexts:
                context = self._contextDict.get(ctxName)
                context['canContain'].add(tagName)

        #work out certain useful attributes
        self.valid_tags = set(self._tagDict.keys())
        self.valid_block_tags = self.tagsAllowedInContext('block')
        self.valid_inline_tags = self.tagsAllowedInContext('inline')
        self.valid_other_tags = self.valid_tags - self.valid_block_tags - self.valid_inline_tags

    def allowedAttrs(self, tagName):
        """Return set of allowed attributes for the tag"""
        return self._tagDict[tagName]['attrs']

    def forcedAttrs(self, tagName):
        """Return set of forced attributes for the tag"""
        if 'force_attrs' in self._tagDict[tagName]:
            return self._tagDict[tagName]['force_attrs']
        else:
            return None

    def getContext(self, tag):
        """Return the main context for tag: its only context, or the
        first one when several are listed.

        >>> g = Cleaner()
        >>> eqCheck(g.getContext('i'),'inline')
        >>> eqCheck(g.getContext('li'),'list')
        """
        contexts = self._tagDict[tag]['context']
        return contexts[0] if isSeq(contexts) else contexts

    def context(self):
        if self.openTagStack:
            c = self.openTagStack[-1][1]
        else:
            c = self.target == 'block' and 'block' or 'inline'
        return c
    context=property(context)

    def isTagAllowedInContext(self, tag, context):
        """Is the tag allowed here?

        >>> g = Cleaner()
        >>> g.isTagAllowedInContext('b','block')
        False
        >>> g.isTagAllowedInContext('a','inline')
        True
        """
        return context in self._tagDict[tag]['context']

    def tagsAllowedInContext(self, context):
        """Set of tag names allowed in this context

        >>> g = Cleaner()
        >>> eqCheck(g.tagsAllowedInContext('table'),set(['tr']))
        >>> eqCheck(g.tagsAllowedInContext('inline'),set(['em', 'a', 'b', 'sub', 'img', 'i', '', 'br', 'sup', 'strong','u']))
        """
        #special case - extreme table removal!
        if context == 'table' and not self.allowTables:
            return []

        return self._contextDict[context]['canContain']

    def reset(self):
        "get ready to do some work"
        HTMLParser.reset(self)
        self.buf = []   #holds output
        self.fixes = []  #holds warnings / debug messages / fixups done
        self.openTagStack = []      #checks for balancing
        self._started = False
        self._currentBlockTag = None   #what kind of block tag are we inside?  Usually <p>
        self._justAfterEntity = False   #flag to say if the last thing we saw was an entity.  Used to detect doubled entities in input

    def close(self):
        "Final tidyups"
        HTMLParser.close(self)
        self.cleanupClosingTags()

    def process(self, markup):
        """The main loop - call this with your markup.

        Applies the configured substitutions, normalises the input, parses
        it, and returns the cleaned markup (encoded if self.encoding is set).
        """
        text = self.asUnicode(markup)
        for substitute in self.substitutions:
            text = substitute(text)
        #put a space before the slash of self-closing tags: <br/> --> <br />
        text = re.sub('<([A-Za-z]+\\w*)/>', '<\\1 />', text)
        text = nakedAmpFix(text.strip())
        self.reset()
        self.feed(self.asUnicode(text))
        self.close()
        cleaned = ''.join(self.buf)
        if self.encoding:
            return cleaned.encode(self.encoding)
        return cleaned

    def dump(self):
        "Print the output accumulated so far (debugging aid)"
        output = ''.join(self.buf)
        print(output)

    def asUnicode(self, markup):
        """Return markup as unicode text.

        Unicode input is returned unchanged. Anything else is decoded as
        strict UTF-8, falling back to cp1252 with replacement characters."""
        #TODO
        if isUnicode(markup):
            return markup
        try:
            return markup.decode('utf8', 'strict')
        except UnicodeDecodeError:
            #assume windows encoding
            return markup.decode('cp1252', 'replace')

    def writeStartTag(self, tag, attrs={}):
        """Helper to do what it says.  Called to write a tag to output.

        Never write your own tags to output; instead call this.  This will
        maintain a stack and ensure they are balanced. It also sets the
        mode every time for you."""
        #for table removal, we just don't write it out. It's easier
        #to have writeStartTag called (from several places) because we
        #need to keep track of the fact that we are in a table-to-be-removed.
        if self.remainingLines != None and self.remainingLines <= 0:
            return
        if tag == 'table' and not self.allowTables:
            self.openTagStack.append((tag,'table'))
            return

        #self.dump()
        #prefilter to remove all block tags in inline markup

        if tag not in self.valid_inline_tags:
            if self.target == 'inline':
                return

        adict = dict(attrs)

        if tag == 'img' and self.imgSrcTr:
            if isinstance(self.imgSrcTr, str):
                p = os.path.join(self.imgSrcTr, os.path.split(adict['src'])[-1])
                p = p.replace('\\', '/')
                adict['src'] = p
            else:
                adict['src'] = self.imgSrcTr(adict['src'])

        if tag == 'a' and self.aHrefTr:
            href = adict['href']
            if isinstance(self.aHrefTr, str):
                if not (href.startswith('http://') or href.startswith('https://')):
                    adict['href'] = self.aHrefTr.rstrip('/') + '/' + href.lstrip('/')
            else:
                adict['href'] = self.aHrefTr(href)

        attrs = [ (k, adict[k]) for k, _ in attrs ]

        allowedAttrs = self.allowedAttrs(tag)
        forcedAttrs = self.forcedAttrs(tag)
        selfClosing = self._tagDict[tag]['selfClosing']
        #if selfClosing: print "found self-closing tag %s" % tag

        #rebuild the tag as a piece of text
        tagBits = ['<']
        tagBits.append(tag)
        for k, v in attrs:
            if (k in allowedAttrs or self.special_attr(k)) and v is not None:
                v = self.asUnicode(v)
                if k[0:2].lower() != 'on' and v[0:10].lower() != 'javascript':
                    tagBits.append(' %s="%s"' % (k, v))

        # If there are any forced attributes
        if forcedAttrs and len(forcedAttrs) > 0:
            tag_attrs = [k for k,v in attrs]
            for k in forcedAttrs:
                if k not in tag_attrs:
                    tagBits.append(' %s=""'% k)
        if selfClosing:
            tagBits.append('/>')
        else:
            tagBits.append('>')
        tagText = ''.join(tagBits)

        self.buf.append(tagText)

        #and put it on the stack....
        if not selfClosing:
            context = self.context  #current context
            #if block, remember how to close
            if context == 'block':
                self._currentBlockTag = (tag,'block')
            #set the mode
            if tag == 'table':
                ncontext = 'table'
            elif tag == 'tr':
                ncontext = 'tr'
            elif tag in ('td', 'th'):
                ncontext = 'td'
            elif tag in ('ul', 'ol'):
                ncontext = 'list'
            elif tag == 'li':
                ncontext = 'li'
            else:
                #block and inline always lead to inline
                ncontext = 'inline'
            self.openTagStack.append((tag,ncontext))

    def writeEndTag(self, tag):
        """Close the tag, but check for nesting errors.

        Never write to the buffer directly; this keeps the stack
        and mode organised."""
        if not self.enoughSpace(tag):
            return
        try:
            lastTag,lastContext = self.openTagStack.pop()
        except:
            print(self.openTagStack)
            raise
        if tag == 'table':
            if not self.allowTables:
                return
            if not self.currentTableHasContent():
                #remove everything inside the present table
                while True:
                    popped = self.buf.pop()
                    if popped.startswith('<table'):
                        break
                return

        #prefilter to remove all block tags in inline markup
        if self.target == 'inline':
            if tag not in self.valid_inline_tags:
                return
        if lastTag != tag:
            raise ValueError("stack is messed up trying to close %s; current open tag was %s" % (tag, lastTag))
        self.buf.append('</%s>' % tag)

    def pendingTag(self):
        "What tag is waiting?"
        try:
            return self.openTagStack[-1][0]
        except IndexError:
            return None

    def atStart(self):
        return (len(self.buf) == 0)

    def discardTag(self, tag):
        'Remove everything inside this tag from the stack and buffer'
        while self.openTagStack:
            ctag,ctxt = self.openTagStack.pop()
            n = len(self.buf) - 1
            while n:
                if self.buf[n].startswith('<' + ctag):
                    break
                n -= 1
            self.buf = self.buf[:n]
            if ctag == tag:
                return


    def currentTableHasContent(self):
        'backtrack to last <table> and see if we have any actual non-whitespace content'
        pointer = -1
        item = self.buf[pointer]
        while not item.startswith('<table'):
            #print pointer, item
            pointer -= 1
            item = self.buf[pointer]
            if not item.startswith('<'):
                if item.strip():
                    return True
        return False

    def enoughSpace(self, tag):
        '''
        Tries to determine if the text in the current tag will fit on the remaining number of lines.
        If it does, this method returns True.
        If not, it will discard the text of the current tag and return False.
        '''
        if self.remainingLines == None:
            return True
        if tag == 'p':
            lines = float(self.textLength) / self.lineWidth
        elif tag == 'li':
            # Count 3 characters for the bullet point
            lines = float(self.textLength) / (self.lineWidth - 3)
        else:
            return True
        import math
        lines = int(math.ceil(lines))
        self.remainingLines -= lines
        if self.remainingLines >= 0:
            self.textLength = 0
            return True
        self.discardTag(tag)
        return False

    def writeData(self, text):
        "Used to write out non-tag content"
        if self.remainingLines != None and self.remainingLines <= 0:
            return
        self.textLength += len(text)
        self.buf.append(text)

    def closeCurrentBlock(self):
        """This is used to close any pending inline tags, whenever
        we hit a new block start,  Healthy closing never calls this."""
        if self.atStart():
            return
        if self.target == 'inline':
            return #write nothing
        tag = self.pendingTag()
        if tag is not None:
            while tag in self.valid_inline_tags:
                self.writeEndTag(tag)
                tag = self.pendingTag()
        assert self._currentBlockTag is not None

        #if there are any more end-tags in the stack, chuck them.
        self.openTagStack = [self._currentBlockTag]
        self.writeEndTag(self._currentBlockTag[0])

    def tagInStack(self,tag):
        stack = self.openTagStack
        x = len(stack)
        while x:
            x -= 1
            if tag==stack[x][0]: return True
        return False

    def handle_data(self, data):
        data = data.replace('<', '&lt;').replace('>', '&gt;')

        if not data.strip():
            self.writeData(data)
            return

        if not self.allowTables:
            if self.tagInStack('table'):
                return

        #are we in the right mode?
        tags = self._dataCover.get(self.context,[])
        for t in tags:
            self.writeStartTag(t)

        self.writeData(data)

    def handle_comment(self, name):
        if not self.stripComments:
            self.buf.append('<!--' + self.asUnicode(name) + '-->')

    def handle_entityref(self, name):
        "Handles a named entity.  "
        if self.stripUnknownEntities and not name in self.entities:
            return ''
        self.handle_data('&%s;' % name)

    def handle_charref(self,name):
        self.handle_data('&#%s;' % name)

    def handle_starttag(self, tag, attrs):
        """ Delete all tags except for legal ones, and strip some obvious javascript

        The 'state machine' choices are all here, and in handle_endtag. At any point,
        we know the current context, and the next tag which has its own context. If
        the next tag is expected, fine.  If not, we need to handle each state
        transition intelligently.

        tag is lower-cased before use. Tags that are not in valid_tags are
        dropped silently, as is everything inside a table when tables are
        being removed. attrs is handed on to writeStartTag, which filters it.
        Raises ValueError if the current context is not one of the known ones.
        """
        tag = tag.lower()

        #remove ANYTHING inside a table, if removing tables.
        if not self.allowTables:
            if self.tagInStack('table'):
                return

        if tag=='br':
            #the string below is a statement, not a docstring; it has no effect
            """Handles the <br/> tag.

            Called directly by sgmllib instead of unknown_starttag,
            because it's a singleton tag.  What we do depends on whether
            (a) breaks are allowed in this normalisation, and
            (b) we are inside a block or between them.

            As presently implemented, with breaksAllowed=False...
               <p>one<br/>two</p>   ->  <p>one</p><p>two</p>

            ..and multiple <br> tags beyond the first will create
            extra empty paragraphs.
            """
            #nothing more is written once the line budget is used up
            if self.remainingLines != None and self.remainingLines <= 0:
                return

            if self.breaksAllowed:
                if self.isTagAllowedInContext('br', self.context):
                    self.buf.append(u"<br/>")
                    #a break costs one line of the budget
                    if self.remainingLines != None:
                        self.remainingLines -= 1
            else:
                if self.target == 'block':
                    if self.context == 'inline':
                        self.closeCurrentBlock()
                    elif self.context == 'block':
                        #give them an empty para for each extra <br> they added
                        self.writeStartTag(self.DEFAULT_BLOCK_TAG)
                        self.writeEndTag(self.DEFAULT_BLOCK_TAG)
                else:
                    self.buf.append(' ')  #if they had two lines in the input,
                    #at least a space should separate them in the output.
        elif tag in self.valid_tags:
            context = self.context
            if tag in self.tagsAllowedInContext(context):
                #expected, write it out.  The writer will filter out any unexpected attributes.
                self.writeStartTag(tag, attrs)
            else:
                #Unexpected.  Each context combination has its own rules.
                #We have 7 contexts so 49 combinations, but in most we
                #just want to ignore the tag.
                nextContext = self.getContext(tag)

                if context == 'inline':
                    if nextContext == 'block':
                        #a new block starts: close the open one first
                        self.closeCurrentBlock()
                        self.writeStartTag(tag, attrs)
                    else:
                        pass   #if we get a naked tr, td, li, we'll just ignore it.
                elif context == 'block':
                    if nextContext == 'inline':  #e.g. <i> at start of document
                        #wrap the inline tag in a default block
                        self.writeStartTag(self.DEFAULT_BLOCK_TAG)
                        self.writeStartTag(tag, attrs)
                    elif nextContext in ('table', 'tr', 'list'):
                        #very out-of-context tag, ignore it. e.g. <p><tr>
                        pass
                elif context == 'table':
                    #anything but a tr or td is disallowed
                    if nextContext == 'tr':  #i.e. tag is a td or th
                        #they forgot the tr, repair it
                        self.writeStartTag('tr', {})
                        self.writeStartTag(tag, attrs)
                    else:
                        pass
                elif context == 'tr':
                    #expected a td, anything higher up means we need to close
                    if nextContext == 'table':
                        #got a tr - close the current tr
                        self.writeEndTag('tr')
                        self.writeStartTag(tag, attrs)
                    elif nextContext == 'block':
                        #close the whole table
                        self.writeEndTag('tr')
                        self.writeEndTag('table')
                        self.writeStartTag(tag, attrs)
                    else:
                        pass
                elif context == 'td':
                    #brutal for now, no tags allowed here yet, but
                    #should be like inline.
                    if nextContext == 'table':
                        #got a tr - close the td
                        self.writeEndTag('td')
                        self.writeEndTag('tr')
                        self.writeStartTag(tag, attrs)
                    #elif nextContext == 'block':
                    #    pass
                    else:
                        pass
                elif context == 'list':  #directly inside a ul/ol
                    if nextContext == 'list':   #got another li, close the last li
                        self.writeEndTag('li')
                        self.writeStartTag(tag, attrs)
                    elif nextContext == 'block':
                        self.closeCurrentBlock()
                        self.writeStartTag(tag, attrs)
                    else:
                        pass
                elif context == 'li':   #inside a list item
                    if nextContext == 'list':   #got another li, close the last li
                        self.writeEndTag('li')
                        self.writeStartTag(tag, attrs)
                    elif nextContext == 'block':
                        #a block ends the item and the list around it
                        self.writeEndTag('li')
                        self.closeCurrentBlock()
                        self.writeStartTag(tag, attrs)
                    else:
                        pass
                else:
                    raise ValueError("unexpected context '%s'" % context)

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag in self.valid_tags:
            pending = self.pendingTag()
            if tag == pending:
                #all is wonderful, close it
                self.writeEndTag(tag)
            else:
                #normally, we just ignore unexpected end tags.
                #the stack will be closed at the end.  However
                #if we get a </table>, </ul> etc, it may make
                #sense to close.
                if tag == 'tr':
                    if pending == 'td':
                        #close it
                        self.writeEndTag('td')
                        self.writeEndTag(tag)
                elif tag in ('ul','ol'):
                    if pending == 'li':
                        self.writeEndTag('li')
                        self.writeEndTag(tag)
                elif tag == 'table':
                    if pending == 'td':
                        self.writeEndTag('td')
                        self.writeEndTag('tr')
                        self.writeEndTag(tag)
                    elif pending == 'tr':
                        self.writeEndTag('tr')
                        self.writeEndTag(tag)
        else:
            self.fixes.append("Ignoring unexpected end tag %s" % tag)

    def cleanupClosingTags(self):
        """ Append any missing closing tags. Called at end."""
        while self.openTagStack:
            tag = self.pendingTag()
            if not self.enoughSpace(tag):
                continue
            self.openTagStack.pop()
            #special case for <table></table> which we want to discard
            if tag == 'table' and self.buf[-1].startswith('<table'):
                self.buf.pop()
            else:
                self.buf.append(u"</%s>" % tag)
            self.fixes.append("appended missing end tag </%s>" % tag)

    def unescape(self,s):
        '''overrides entity handling in attributeValues'''
        return s

`__init__(target='block', breaksAllowed=True, stripComments=False, stripUnknownEntities=True, allowImages=True, allowTables=True, allowAtags=True, allowStyleAttr=True, allowAlignAttr=True, aHrefTr=None, imgSrcTr=None, substitutions=[], maxLines=None, lineWidth=40, entities=known_entities.keys(), encoding=None, special_attr_pfxs=['rl-'])`

Initialising defines your language options. You can re-use the same parser many times. If breaksAllowed is true, `<br/>` tags are written to the output. If not, each one becomes a single space in inline mode and ends the current block in block mode.

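substitutions is a singleton or a list whose items may each be: a pattern `pat` (every match is replaced with `''`), a pair `(pat, str)` (every match is replaced with `str`), or a callable `c(src)` that returns the new source.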

pat may be a str or a compiled pattern. These substitutions are done before parsing.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def __init__(self,
            target="block",
            breaksAllowed=True,
            stripComments=False,
            stripUnknownEntities=True,
            allowImages=True,
            allowTables=True,
            allowAtags=True,
            allowStyleAttr=True, #passed through to para and headings
            allowAlignAttr=True, #passed through to para and headings
            aHrefTr=None,
            imgSrcTr=None,
            substitutions=[],
            maxLines=None,
            lineWidth=40,
            entities = known_entities.keys(),
            encoding = None,
            special_attr_pfxs=['rl-'],
            ):
    """Initialising defines your language options.
    You can re-use the same parser many times.

    target          'block' or 'inline'; the kind of markup to produce.
    breaksAllowed   if true, <br/> tags are written to output. If not,
                    in inline mode they become a single space, and in
                    block mode they end the block.
    stripComments   if true, comments are dropped from the output.
    stripUnknownEntities
                    if true, named entities not in 'entities' are dropped.
    allowImages, allowAtags
                    if false, 'img' / 'a' are removed from the grammar.
    allowTables     if false, tables and everything in them are removed.
    allowStyleAttr, allowAlignAttr
                    allow 'style' / 'align' on paragraphs and headings.
    aHrefTr, imgSrcTr
                    a string or a callable used by writeStartTag to
                    rewrite 'href' / 'src' values.
    maxLines, lineWidth
                    optional line budget, and the characters per line
                    used to estimate how much of it a block consumes.
    entities        names of the known entities; 'lt', 'gt' and 'amp'
                    are always added.
    encoding        if set, process() returns its result encoded this way.
    special_attr_pfxs
                    attribute-name prefixes (used as regex fragments)
                    that are always passed through. An empty sequence
                    means no attribute is special.

    substitutions is a singleton or list containing
        pat         pat --> ''
        (pat,str)   pat --> str
        callable    c(src) --> src

    pat may be a str or a compiled pattern. These substitutions
    are done before parsing.

    Raises ValueError for an item of substitutions that is none of
    the above. The list defaults are only read, never mutated.
    """
    target = self.asUnicode(target)
    self.stripUnknownEntities = stripUnknownEntities
    self.allowImages = allowImages
    self.allowTables = allowTables
    self.allowAtags = allowAtags
    self.aHrefTr = aHrefTr
    self.imgSrcTr = imgSrcTr
    self.encoding = encoding
    self.allowAlignAttr = allowAlignAttr
    self.allowStyleAttr = allowStyleAttr
    if special_attr_pfxs:
        self.special_attr = re.compile('^(?:%s)' % '|'.join(special_attr_pfxs)).match
    else:
        #an empty alternation '^(?:)' would match every attribute name
        self.special_attr = lambda name: None
    self._setupGrammar()

    assert target in (u"block", u"inline"), "unexpected block '%s', must be 'block' or 'inline'" % target
    self.target = target
    HTMLParser.__init__(self,
        **(dict(convert_charrefs=False) if sys.version_info>=(3,4) else {}))
    self.breaksAllowed = breaksAllowed
    self.stripComments = stripComments
    self.entities = set(entities).union(('lt', 'gt', 'amp'))

    #turn every item of the substitutions list into a callable src --> src
    if not isinstance(substitutions,(list,tuple)):
        substitutions = (substitutions,)
    prepared = []
    for s in substitutions:
        if isinstance(s,strTypes):
            s = lambda x,pat=re.compile(asUnicode(s)): pat.sub('',x)
        elif hasattr(s,'sub'):
            s = lambda x,pat=s: pat.sub('',x)
        elif isinstance(s,(tuple,list)) and len(s)==2:
            p, repl = s
            if isinstance(p,strTypes):
                p = re.compile(asUnicode(p))
            elif not hasattr(p,'sub'):
                #wrap in a 1-tuple: a bare tuple would be unpacked by %
                raise ValueError('Invalid value %r in substitions list' % (s,))
            s = lambda x,pat=p,repl=repl: pat.sub(repl,x)
        elif not callable(s):
            raise ValueError('Invalid value %r in substitions list' % (s,))
        prepared.append(s)
    self.substitutions = prepared
    self.remainingLines = maxLines
    self.lineWidth = lineWidth
    self.textLength = 0

`allowedAttrs(tagName)`

Return the allowed attribute names for the tag

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def allowedAttrs(self, tagName):
    """Return the attributes permitted on ``tagName``.

    The value is whatever is stored under the 'attrs' key of the tag's
    entry in ``self._tagDict``; an unknown tag raises KeyError.
    """
    tagInfo = self._tagDict[tagName]
    return tagInfo['attrs']

`asUnicode(markup)`

convert to unicode

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def asUnicode(self, markup):
    """Return ``markup`` as unicode text.

    Values that isUnicode() already accepts are returned untouched.
    Anything else is decoded as strict UTF-8 first; if that fails the
    bytes are assumed to be Windows cp1252 and decoded with 'replace'.
    """
    #TODO
    if isUnicode(markup):
        return markup
    try:
        return markup.decode('utf8', 'strict')
    except UnicodeDecodeError:
        #assume windows encoding
        return markup.decode('cp1252', 'replace')

`cleanupClosingTags()`

Append any missing closing tags. Called at end.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def cleanupClosingTags(self):
    """Append any missing closing tags. Called at end.

    Works through ``self.openTagStack`` from the innermost tag outwards.
    For each tag one of three things happens:

    * a ``table`` that was opened while tables are disallowed had no
      start tag written (writeStartTag only records it on the stack),
      so no end tag is written either;
    * a ``table`` whose start tag is the last thing in the buffer is
      empty, so the start tag is removed instead of being closed;
    * anything else gets its ``</tag>`` appended to the buffer.

    A note is added to ``self.fixes`` for every tag taken off the stack.
    """
    while self.openTagStack:
        tag = self.pendingTag()
        if not self.enoughSpace(tag):
            # enoughSpace() has already discarded the tag (stack entry
            # and buffered output), so just look at the stack again.
            continue
        self.openTagStack.pop()
        if tag == 'table' and not self.allowTables:
            # start tag was suppressed; writing </table> would unbalance
            # the output
            pass
        elif tag == 'table' and self.buf and self.buf[-1].startswith('<table'):
            #special case for <table></table> which we want to discard
            self.buf.pop()
        else:
            self.buf.append(u"</%s>" % tag)
        self.fixes.append("appended missing end tag </%s>" % tag)

`close()`

Final tidyups

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def close(self):
    """Finish parsing and balance the output.

    Lets HTMLParser process whatever input is still buffered, then
    closes every tag that is still open via cleanupClosingTags().
    """
    HTMLParser.close(self)
    self.cleanupClosingTags()

`closeCurrentBlock()`

This is used to close any pending inline tags whenever we hit a new block start. Healthy closing never calls this.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def closeCurrentBlock(self):
    """Close the block we are in, together with any inline tags still
    open inside it.

    Used whenever a new block starts before the previous one was closed;
    well-formed input never gets here.  Does nothing at the very start
    of the document or when the target is 'inline'.
    """
    if self.atStart() or self.target == 'inline':
        return  #write nothing

    #unwind the inline tags sitting on top of the stack
    pending = self.pendingTag()
    if pending is not None:
        while pending in self.valid_inline_tags:
            self.writeEndTag(pending)
            pending = self.pendingTag()
    assert self._currentBlockTag is not None

    #whatever else is still on the stack is thrown away; only the
    #current block remains, and that is closed straight away.
    blockEntry = self._currentBlockTag
    self.openTagStack = [blockEntry]
    self.writeEndTag(blockEntry[0])

`currentTableHasContent()`

Backtrack to the last `<table>` and see if we have any actual non-whitespace content.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def currentTableHasContent(self):
    """Backtrack to the last <table> and report whether anything other
    than tags and whitespace has been written since.

    Walks ``self.buf`` from the newest item backwards.  Returns True as
    soon as a non-tag item containing non-whitespace text is found, and
    False when the table start tag is reached first.

    Every item is inspected, including the newest one (the previous
    implementation skipped it, so a table whose last buffered item was
    text looked empty).  An empty buffer returns False instead of raising
    IndexError; if no ``<table`` is buffered at all the whole buffer is
    scanned.
    """
    for item in reversed(self.buf):
        if item.startswith('<table'):
            return False
        if not item.startswith('<') and item.strip():
            return True
    return False

`discardTag(tag)`

Remove everything inside this tag from the stack and buffer

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def discardTag(self, tag):
    """Remove everything inside this tag from the stack and buffer.

    Pops entries off ``self.openTagStack`` until ``tag`` itself has been
    popped.  For every popped entry the buffer is cut back to just before
    that entry's most recent start tag.

    The start tag is matched on the complete tag name: the name must be
    followed by a space, '>' or '/'.  A plain prefix test would take
    '<br/>' for the start of 'b', or '<pre>' for 'p', and cut the buffer
    in the wrong place.  If no start tag is found for an entry (for
    example a table whose start tag was never written) the buffer is left
    as it is, and an empty buffer is not an error.
    """
    while self.openTagStack:
        ctag, _ctxt = self.openTagStack.pop()
        prefix = '<' + ctag
        nameEnd = len(prefix)
        for n in range(len(self.buf) - 1, -1, -1):
            item = self.buf[n]
            if item.startswith(prefix) and item[nameEnd:nameEnd + 1] in (' ', '>', '/'):
                self.buf = self.buf[:n]
                break
        if ctag == tag:
            return

`enoughSpace(tag)`

Tries to determine if the text in the current tag will fit on the remaining number of lines. If it does, this method returns True. If not, it will discard the text of the current tag and return False.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def enoughSpace(self, tag):
    '''
    Decide whether the text collected for the current tag still fits in
    the remaining line budget.

    Only 'p' and 'li' are measured; every other tag, and a cleaner with no
    line limit, always fits.  The number of lines is estimated as
    textLength / lineWidth, rounded up (an 'li' loses 3 characters to the
    bullet), and is deducted from remainingLines.

    Returns True if the text fits, resetting textLength for the next tag.
    Otherwise the tag is discarded via discardTag() and False is returned.
    '''
    if self.remainingLines is None:
        return True
    if tag == 'p':
        charsPerLine = self.lineWidth
    elif tag == 'li':
        # Count 3 characters for the bullet point
        charsPerLine = self.lineWidth - 3
    else:
        return True
    import math
    needed = int(math.ceil(float(self.textLength) / charsPerLine))
    self.remainingLines -= needed
    if self.remainingLines < 0:
        self.discardTag(tag)
        return False
    self.textLength = 0
    return True

`forcedAttrs(tagName)`

Return set of forced attributes for the tag

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def forcedAttrs(self, tagName):
    """Return the attributes that must always be written for the tag.

    This is the 'force_attrs' entry of the tag's record in
    ``self._tagDict``, or None when the record has no such entry.
    """
    return self._tagDict[tagName].get('force_attrs')

`getContext(tag)`

Return main context for tag

    >>> g = Cleaner()
    >>> eqCheck(g.getContext('i'),'inline')
    >>> eqCheck(g.getContext('li'),'list')

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def getContext(self, tag):
    """Return the main context the tag belongs to.

    A tag's 'context' entry in ``self._tagDict`` is either a single
    context or a sequence of them; for a sequence the first one counts
    as the main context.

    >>> g = Cleaner()
    >>> eqCheck(g.getContext('i'),'inline')
    >>> eqCheck(g.getContext('li'),'list')
    """
    contexts = self._tagDict[tag]['context']
    return contexts[0] if isSeq(contexts) else contexts

`handle_entityref(name)`

Handles a named entity.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def handle_entityref(self, name):
    """Handle a named entity such as ``&nbsp;``.

    When unknown entities are being stripped and ``name`` is not among
    ``self.entities`` the entity is dropped and '' is returned.
    Otherwise it is rebuilt as ``&name;`` and passed to handle_data().
    """
    if self.stripUnknownEntities and name not in self.entities:
        return ''
    entityText = '&%s;' % name
    self.handle_data(entityText)

`handle_starttag(tag, attrs)`

Delete all tags except for legal ones, and strip some obvious javascript

The 'state machine' choices are all here, and in unknown_endtag. At any point, we know the current context, and the next tag which has its own context. If the next tag is expected, fine. If not, we need to handle each state transition intelligently.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def handle_starttag(self, tag, attrs):
    """ Delete all tags except for legal ones, and strip some obvious javascript

    The 'state machine' choices are all here, and in unknown_endtag. At any point,
    we know the current context, and the next tag which has its own context. If
    the next tag is expected, fine.  If not, we need to handle each state
    transition intelligently.

    tag is lower-cased before use.  attrs is handed on unchanged to
    writeStartTag, which does the attribute filtering.

    Outcomes, in the order they are tested:
      - tables are disallowed and a table is on the stack: ignored;
      - 'br': handled according to breaksAllowed / target / context;
      - a tag in self.valid_tags: written if the current context allows
        it, otherwise repaired or ignored per the transitions below;
      - anything else: falls through both branches and is dropped.

    Raises ValueError when an out-of-context valid tag arrives while
    self.context is none of the contexts handled below.
    """
    tag = tag.lower()

    #remove ANYTHING inside a table, if removing tables.
    if not self.allowTables:
        if self.tagInStack('table'):
            return

    if tag=='br':
        #NOTE: the string below is a bare expression statement, not a
        #docstring; it has no effect at run time and is kept as-is.
        """Handles the <br/> tag.

        Called directly by sgmllib instead of unknown_starttag,
        because it's a singleton tag.  What we do depends on whether
        (a) breaks are allowed in this normalisation, and
        (b) we are inside a block or between them.

        As presently implemented, with breaksAllowed=False...
           <p>one<br/>two</p>   ->  <p>one</p><p>two</p>

        ..and multiple <br> tags beyond the first will create
        extra empty paragraphs.
        """
        #line budget used up: no more output of any kind
        if self.remainingLines != None and self.remainingLines <= 0:
            return

        if self.breaksAllowed:
            if self.isTagAllowedInContext('br', self.context):
                self.buf.append(u"<br/>")
                #each break that is written costs one line of the budget
                if self.remainingLines != None:
                    self.remainingLines -= 1
        else:
            if self.target == 'block':
                if self.context == 'inline':
                    self.closeCurrentBlock()
                elif self.context == 'block':
                    #give them an empty para for each extra <br> they added
                    self.writeStartTag(self.DEFAULT_BLOCK_TAG)
                    self.writeEndTag(self.DEFAULT_BLOCK_TAG)
            else:
                self.buf.append(' ')  #if they had two lines in the input,
                #at least a space should separate them in the output.
    elif tag in self.valid_tags:
        context = self.context
        if tag in self.tagsAllowedInContext(context):
            #expected, write it out.  The writer will filter out any unexpected attributes.
            self.writeStartTag(tag, attrs)
        else:
            #Unexpected.  Each combination of current context and the
            #tag's own context has its own rule, but in most of them we
            #just want to ignore the tag.
            nextContext = self.getContext(tag)

            if context == 'inline':
                if nextContext == 'block':
                    self.closeCurrentBlock()
                    self.writeStartTag(tag, attrs)
                else:
                    pass   #if we get a naked tr, td, li, we'll just ignore it.
            elif context == 'block':
                if nextContext == 'inline':  #e.g. <i> at start of document
                    #wrap the stray inline tag in a default block first
                    self.writeStartTag(self.DEFAULT_BLOCK_TAG)
                    self.writeStartTag(tag, attrs)
                elif nextContext in ('table', 'tr', 'list'):
                    #very out-of-context tag, ignore it. e.g. <p><tr>
                    pass
            elif context == 'table':
                #anything but a tr or td is disallowed
                if nextContext == 'tr':  #i.e. tag is a td
                    #they forgot the tr, repair it
                    self.writeStartTag('tr', {})
                    self.writeStartTag(tag, attrs)
                else:
                    pass
            elif context == 'tr':
                #expected a td, anything higher up means we need to close
                if nextContext == 'table':
                    #got a tr - close the current tr
                    self.writeEndTag('tr')
                    self.writeStartTag(tag, attrs)
                elif nextContext == 'block':
                    #close the whole table
                    self.writeEndTag('tr')
                    self.writeEndTag('table')
                    self.writeStartTag(tag, attrs)
                else:
                    pass
            elif context == 'td':
                #brutal for now, no tags allowed here yet, but
                #should be like inline.
                if nextContext == 'table':
                    #got a tr - close the td
                    self.writeEndTag('td')
                    self.writeEndTag('tr')
                    self.writeStartTag(tag, attrs)
                #elif nextContext == 'block':
                #    pass
                else:
                    pass
            elif context == 'list':  #presumably directly inside a ul/ol
                if nextContext == 'list':   #got another li, close the last li
                    self.writeEndTag('li')
                    self.writeStartTag(tag, attrs)
                elif nextContext == 'block':
                    self.closeCurrentBlock()
                    self.writeStartTag(tag, attrs)
                else:
                    pass
            elif context == 'li':   #presumably inside an open li
                if nextContext == 'list':   #got another li, close the last li
                    self.writeEndTag('li')
                    self.writeStartTag(tag, attrs)
                elif nextContext == 'block':
                    #close the li, then the enclosing block, before
                    #starting the new block
                    self.writeEndTag('li')
                    self.closeCurrentBlock()
                    self.writeStartTag(tag, attrs)
                else:
                    pass
            else:
                raise ValueError("unexpected context '%s'" % context)

`isTagAllowedInContext(tag, context)`

Is the tag allowed here?

    >>> g = Cleaner()
    >>> g.isTagAllowedInContext('b','block')
    False
    >>> g.isTagAllowedInContext('a','inline')
    True

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def isTagAllowedInContext(self, tag, context):
    """Tell whether ``tag`` may appear in ``context``.

    The answer comes from a membership test against the tag's 'context'
    entry in ``self._tagDict``.

    >>> g = Cleaner()
    >>> g.isTagAllowedInContext('b','block')
    False
    >>> g.isTagAllowedInContext('a','inline')
    True
    """
    allowedContexts = self._tagDict[tag]['context']
    return context in allowedContexts

`pendingTag()`

What tag is waiting?

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def pendingTag(self):
    """Return the name of the innermost open tag, or None if no tag is
    open.

    The name is the first element of the entry on top of
    ``self.openTagStack``."""
    try:
        topEntry = self.openTagStack[-1]
        return topEntry[0]
    except IndexError:
        return None

`process(markup)`

The main loop - call this with your markup

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def process(self, markup):
    """The main entry point - call this with your markup.

    Steps, in order: convert to unicode, apply the configured
    substitutions, put a space before '/>' in attribute-less
    self-closing tags, strip surrounding whitespace and run
    nakedAmpFix, reset the parser, feed it the text and close it.

    Returns the joined output buffer, encoded with ``self.encoding``
    when one was configured and as text otherwise.
    """
    text = self.asUnicode(markup)
    for substitute in self.substitutions:
        text = substitute(text)
    text = re.sub('<([A-Za-z]+\\w*)/>', '<\\1 />', text)
    text = nakedAmpFix(text.strip())
    self.reset()
    text = self.asUnicode(text)
    self.feed(text)
    self.close()
    cleaned = ''.join(self.buf)
    if self.encoding:
        return cleaned.encode(self.encoding)
    return cleaned

`reset()`

get ready to do some work

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def reset(self):
    """Get ready to do some work.

    Resets the underlying HTMLParser and clears all per-document state,
    so the same cleaner can be used for another piece of markup.
    """
    HTMLParser.reset(self)
    #output and diagnostics
    self.buf = []       #holds output
    self.fixes = []     #holds warnings / debug messages / fixups done
    #nesting state
    self.openTagStack = []          #checks for balancing
    self._currentBlockTag = None    #what kind of block tag are we inside?  Usually <p>
    #flags
    self._started = False
    self._justAfterEntity = False   #flag to say if the last thing we saw was an entity.  Used to detect doubled entities in input

`tagsAllowedInContext(context)`

Set of tag names allowed in this context

    >>> g = Cleaner()
    >>> eqCheck(g.tagsAllowedInContext('table'),set(['tr']))
    >>> eqCheck(g.tagsAllowedInContext('inline'),set(['em', 'a', 'b', 'sub', 'img', 'i', '', 'br', 'sup', 'strong','u']))

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def tagsAllowedInContext(self, context):
    """Return the tag names that may appear in ``context``.

    Normally this is the 'canContain' entry for the context in
    ``self._contextDict``.  When tables are disallowed the 'table'
    context allows nothing and an empty list is returned.

    >>> g = Cleaner()
    >>> eqCheck(g.tagsAllowedInContext('table'),set(['tr']))
    >>> eqCheck(g.tagsAllowedInContext('inline'),set(['em', 'a', 'b', 'sub', 'img', 'i', '', 'br', 'sup', 'strong','u']))
    """
    #special case - extreme table removal!
    tableBeingRemoved = context == 'table' and not self.allowTables
    if tableBeingRemoved:
        return []

    contextInfo = self._contextDict[context]
    return contextInfo['canContain']

`unescape(s)`

overrides entity handling in attributeValues

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def unescape(self,s):
    '''Override of entity handling in attribute values: the value is
    handed back exactly as received, with no entities expanded.'''
    unchanged = s
    return unchanged

`writeData(text)`

Used to write out non-tag content

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def writeData(self, text):
    """Write non-tag content to the output buffer.

    The length of the text is added to ``self.textLength``.  Nothing is
    written or counted once a line limit is set and used up.
    """
    budget = self.remainingLines
    if budget is not None and budget <= 0:
        return
    self.textLength += len(text)
    self.buf.append(text)

`writeEndTag(tag)`

Close the tag, but check for nesting errors.

Never write to the buffer directly; this keeps the stack and mode organised.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def writeEndTag(self, tag):
    """Close the tag, but check for nesting errors.

    Never write to the buffer directly; this keeps the stack
    and mode organised.

    Nothing happens if enoughSpace() rejects the tag.  Otherwise the top
    stack entry is popped and then:
      - a table with tables disallowed writes nothing;
      - a table without content is removed from the buffer entirely;
      - a non-inline tag with an 'inline' target writes nothing;
      - any other tag gets its end tag appended.

    Raises ValueError if the popped entry is not ``tag``.  An empty stack
    prints the stack and re-raises the error from pop().
    """
    if not self.enoughSpace(tag):
        return
    try:
        openTag, _openContext = self.openTagStack.pop()
    except:
        print(self.openTagStack)
        raise

    if tag == 'table':
        if not self.allowTables:
            return
        if not self.currentTableHasContent():
            #remove everything inside the present table, start tag included
            popped = self.buf.pop()
            while not popped.startswith('<table'):
                popped = self.buf.pop()
            return

    #prefilter to remove all block tags in inline markup
    if self.target == 'inline' and tag not in self.valid_inline_tags:
        return
    if openTag != tag:
        raise ValueError("stack is messed up trying to close %s; current open tag was %s" % (tag, openTag))
    self.buf.append('</%s>' % tag)

`writeStartTag(tag, attrs={})`

Helper to do what it says. Called to write a tag to output.

Never write your own tags to output; instead call this. This will maintain a stack and ensure they are balanced. It also sets the mode every time for you.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

def writeStartTag(self, tag, attrs={}):
    """Helper to do what it says.  Called to write a tag to output.

    Never write your own tags to output; instead call this.  This will
    maintain a stack and ensure they are balanced. It also sets the
    mode every time for you.

    tag   -- tag name; it must have an entry in self._tagDict.
    attrs -- sequence of (name, value) pairs; an empty dict is accepted
             too.  It is only read, never mutated, so the shared default
             is harmless.

    Nothing is written when the line budget (self.remainingLines) is used
    up, or when the tag is not an inline tag and the target is 'inline'.
    A table start with tables disallowed is only recorded on the stack.

    imgSrcTr / aHrefTr are applied to <img src> / <a href> only when the
    attribute is present with a value, so <a name="x"> or <img> without
    src no longer raise.

    An attribute is written only if it is allowed for the tag (or matches
    self.special_attr), has a value, its name does not start with 'on'
    and its value is not a javascript URL.  Double quotes inside a value
    are written as &quot; so a value cannot end its own attribute.
    """
    #for table removal, we just don't write it out. It's easier
    #to have writeStartTag called (from several places) because we
    #need to keep track of the fact that we are in a table-to-be-removed.
    if self.remainingLines is not None and self.remainingLines <= 0:
        return
    if tag == 'table' and not self.allowTables:
        self.openTagStack.append((tag,'table'))
        return

    #prefilter to remove all block tags in inline markup
    if tag not in self.valid_inline_tags:
        if self.target == 'inline':
            return

    adict = dict(attrs)

    #imgSrcTr is either a directory to re-home the image file name into,
    #or a callable mapping the old src to the new one
    src = adict.get('src')
    if tag == 'img' and self.imgSrcTr and src is not None:
        if isinstance(self.imgSrcTr, str):
            p = os.path.join(self.imgSrcTr, os.path.split(src)[-1])
            adict['src'] = p.replace('\\', '/')
        else:
            adict['src'] = self.imgSrcTr(src)

    #aHrefTr is either a base url for links that are not absolute
    #http(s) urls, or a callable mapping the old href to the new one
    href = adict.get('href')
    if tag == 'a' and self.aHrefTr and href is not None:
        if isinstance(self.aHrefTr, str):
            if not (href.startswith('http://') or href.startswith('https://')):
                adict['href'] = self.aHrefTr.rstrip('/') + '/' + href.lstrip('/')
        else:
            adict['href'] = self.aHrefTr(href)

    #keep the original attribute order, but with the transformed values
    attrs = [ (k, adict[k]) for k, _ in attrs ]

    allowedAttrs = self.allowedAttrs(tag)
    forcedAttrs = self.forcedAttrs(tag)
    selfClosing = self._tagDict[tag]['selfClosing']

    #browsers ignore leading control characters/spaces and any embedded
    #tab or newline in a url, so ignore them too when looking for
    #javascript
    leadingJunk = ''.join(chr(i) for i in range(33))

    #rebuild the tag as a piece of text
    tagBits = ['<']
    tagBits.append(tag)
    for k, v in attrs:
        if (k in allowedAttrs or self.special_attr(k)) and v is not None:
            v = self.asUnicode(v)
            probe = v.lstrip(leadingJunk)
            for ch in '\t\n\r':
                probe = probe.replace(ch, '')
            if k[0:2].lower() != 'on' and probe[0:10].lower() != 'javascript':
                tagBits.append(' %s="%s"' % (k, v.replace('"', '&quot;')))

    # If there are any forced attributes
    if forcedAttrs and len(forcedAttrs) > 0:
        tag_attrs = [k for k,v in attrs]
        for k in forcedAttrs:
            if k not in tag_attrs:
                tagBits.append(' %s=""'% k)
    if selfClosing:
        tagBits.append('/>')
    else:
        tagBits.append('>')
    tagText = ''.join(tagBits)

    self.buf.append(tagText)

    #and put it on the stack....
    if not selfClosing:
        context = self.context  #current context
        #if block, remember how to close
        if context == 'block':
            self._currentBlockTag = (tag,'block')
        #set the mode
        if tag == 'table':
            ncontext = 'table'
        elif tag == 'tr':
            ncontext = 'tr'
        elif tag in ('td', 'th'):
            ncontext = 'td'
        elif tag in ('ul', 'ol'):
            ncontext = 'list'
        elif tag == 'li':
            ncontext = 'li'
        else:
            #block and inline always lead to inline
            ncontext = 'inline'
        self.openTagStack.append((tag,ncontext))

`cleanBlocks(input, **options)`

Accept markup as one or more blocks.

The output of this should be safe for use within a `<div>` or `<body>` tag in HTML, and also convertible to RML.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

@unifunc
def cleanBlocks(input, **options):
    """Clean markup that consists of one or more blocks.

    The result should be safe for use within a <div> or <body> tag in
    HTML, and also convertible to RML.  ``options`` are passed to the
    Cleaner constructor, which is created with target='block'.
    """
    cleaner = Cleaner(target='block', **options)
    return cleaner.process(input)

`cleanInline(input, **options)`

Accept and normalize markup for use inline.

The output of this should be safe for use within a `<p>` tag in HTML, and also convertible to RML.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

@unifunc
def cleanInline(input, **options):
    """Clean and normalize markup that is to be used inline.

    The result should be safe for use within a <p> tag in HTML, and also
    convertible to RML.  ``options`` are passed to the Cleaner
    constructor, which is created with target='inline'.
    """
    cleaner = Cleaner(target='inline', **options)
    return cleaner.process(input)

`cleanPlain(input, **options)`

Remove all tags to output plain text.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

@unifunc
def cleanPlain(input, **options):
    """Reduce markup to plain text.

    All tags are removed with stripTags() and the remaining text is
    passed through escape().  ``options`` is accepted but not used.
    """
    plainText = stripTags(input)
    return escape(plainText)

`filterRE(s, r)`

Substitutes the matches of r in s with an empty string.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

@unifunc
def filterRE(s, r):
    """Delete every match of r from s and return the result.

    r may be a compiled pattern (anything with a ``sub`` attribute) or a
    pattern string, which is then handed to re.sub.
    """
    try:
        substitute = r.sub
    except AttributeError:
        #not a compiled pattern; let re compile it
        substitute = None
    if substitute is None:
        return re.sub(r, '', s)
    return substitute('', s)

`fixTruncated(s)`

Try to remove truncated tags at the end of str.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

@unifunc
def fixTruncated(s):
    """Try to remove a truncated tag from the end of s.

    Whatever the ``truncated_tag`` pattern matches is deleted by
    filterRE().
    """
    repaired = filterRE(s, truncated_tag)
    return repaired

`truncateHTML(input, maxLines, **options)`

Truncates HTML to a maximum of `maxLines` lines of text. Tags don't count towards the text length. Lists, tables and other big blocks get removed completely if the limit is reached inside them.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py

@unifunc
def truncateHTML(input, maxLines, **options):
    '''
    Clean html and cut it off after at most maxLines lines of text.
    Tags don't count towards the text length.
    Lists, tables and other big blocks get removed completely if
    the limit is reached inside them.
    The cleaner is created with breaksAllowed=False; any other
    options are passed on to it.
    '''
    cleaner = Cleaner(breaksAllowed=False, maxLines=maxLines, **options)
    return cleaner.process(input)

functions in xhtml2rml

`xhtml2rml(xml, paraStyle='normal', tableStyle='noPaddingStyle', bulletStyle='bullet', pathTransform=None, imageTransformKwds={}, allowMailtoLinks=False, useModernLists=True, ulStyle=None, olStyle=None, liParaStyle=None, tagAttrs={},)`

Convert chunk of our mini-html to RML.