Skip to content

ReportLab's XML Tools

We've been around for a very long time. When we got started with RML in 2001, we rapidly found a need for a lot of markup processing tools, and Python did not have the standard solutions it has today.

This page documents some of the functions in the rlextra.radxml package which you might find useful.

Sanitising markup, and converting to RML

functions in html_cleaner

Here are some important cleaning functions within rlextra.

Cleaner

Bases: HTMLParser

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
class Cleaner(HTMLParser):

    def __init__(self,
                target="block",
                breaksAllowed=True,
                stripComments=False,
                stripUnknownEntities=True,
                allowImages=True,
                allowTables=True,
                allowAtags=True,
                allowStyleAttr=True, #passed through to para and headings
                allowAlignAttr=True, #passed through to para and headings
                aHrefTr=None,
                imgSrcTr=None,
                substitutions=[],
                maxLines=None,
                lineWidth=40,
                entities = known_entities.keys(),
                encoding = None,
                special_attr_pfxs=['rl-'],
                ):
        """Initialising defines your language options.

        You can re-use the same parser many times.

        target              'block' or 'inline'; anything else fails the assertion.
        breaksAllowed       if true, <br/> is written to output.  If not, in
                            block mode a break ends the current block and in
                            inline mode it is replaced by a single space.
        stripComments       drop comments instead of copying them to output.
        stripUnknownEntities  drop named entities that are not in 'entities'.
        allowImages         if false, 'img' is removed from the grammar.
        allowTables         if false, tables and everything inside them are removed.
        allowAtags          if false, 'a' is removed from the grammar.
        allowStyleAttr      keep the 'style' attribute on p and h1..h6.
        allowAlignAttr      keep the 'align' attribute on p and h1..h6.
        aHrefTr             str or callable used by writeStartTag to rewrite href.
        imgSrcTr            str or callable used by writeStartTag to rewrite src.
        maxLines, lineWidth line budget and characters per line used by enoughSpace.
        entities            iterable of known entity names; lt, gt and amp
                            are always added.
        encoding            if set, process() returns the result encoded with it.
        special_attr_pfxs   attribute name prefixes that are always allowed;
                            they are joined into a regular expression, so they
                            are treated as pattern text.  An empty list allows
                            no extra attributes.

        substitutions is a singleton or list containing
            pat         pat --> ''
            (pat,str)   pat --> str
            callable    c(src) --> src

        pat may be a str or a compiled pattern. These substitutions
        are done before parsing.

        Raises ValueError for a substitutions entry that is none of the above.
        """
        # The list defaults in the signature are only read, never mutated.
        target = self.asUnicode(target)
        self.stripUnknownEntities = stripUnknownEntities
        self.allowImages = allowImages
        self.allowTables = allowTables
        self.allowAtags = allowAtags
        self.aHrefTr = aHrefTr
        self.imgSrcTr = imgSrcTr
        self.encoding = encoding
        self.allowAlignAttr = allowAlignAttr
        self.allowStyleAttr = allowStyleAttr
        if special_attr_pfxs:
            self.special_attr = re.compile('^(?:%s)' % '|'.join(special_attr_pfxs)).match
        else:
            # '^(?:)' would match every attribute name and switch the
            # attribute whitelist off, so an empty prefix list matches nothing.
            self.special_attr = lambda name: None
        self._setupGrammar()

        assert target in (u"block", u"inline"), "unexpected block '%s', must be 'block' or 'inline'" % target
        self.target = target
        # HTMLParser.__init__ calls self.reset(), which creates the output buffers
        HTMLParser.__init__(self,
            **(dict(convert_charrefs=False) if sys.version_info>=(3,4) else {}))
        self.breaksAllowed = breaksAllowed
        self.stripComments = stripComments
        self.entities = set(entities).union(('lt', 'gt', 'amp'))

        #prefix up the substitutions list: every entry becomes a callable src --> src
        if not isinstance(substitutions,(list,tuple)):
            substitutions = (substitutions,)
        compiled = []
        for s in substitutions:
            if isinstance(s,strTypes):
                fn = lambda x,pat=re.compile(asUnicode(s)): pat.sub('',x)
            elif hasattr(s,'sub'):
                fn = lambda x,pat=s: pat.sub('',x)
            elif isinstance(s,(tuple,list)) and len(s)==2:
                pat, repl = s
                if isinstance(pat,strTypes):
                    pat = re.compile(asUnicode(pat))
                elif not hasattr(pat,'sub'):
                    # (s,) keeps a tuple entry from being unpacked by the % operator
                    raise ValueError('Invalid value %r in substitions list' % (s,))
                fn = lambda x,pat=pat,repl=repl: pat.sub(repl,x)
            elif callable(s):
                fn = s
            else:
                raise ValueError('Invalid value %r in substitions list' % (s,))
            compiled.append(fn)
        self.substitutions = compiled
        self.remainingLines = maxLines
        self.lineWidth = lineWidth
        self.textLength = 0

    def _setupGrammar(self):
        """Build the lookup tables that describe the permitted markup.

        Sets DEFAULT_BLOCK_TAG, _contextDict (context name --> record with
        'allowedIn' and 'canContain'), _tagDict (tag name --> record with
        'context', 'attrs', 'selfClosing' and optionally 'force_attrs'),
        _dataCover (context --> tags used to wrap naked text) and the
        valid_tags / valid_block_tags / valid_inline_tags / valid_other_tags
        sets.  Reads the allowStyleAttr, allowAlignAttr, allowImages,
        allowAtags and (via tagsAllowedInContext) allowTables options.
        """
        self.DEFAULT_BLOCK_TAG = 'p'    #used to close blocks if nothing else given

        def unicodeKeys(mapping):
            #copy of mapping with every key passed through asUnicode
            return {asUnicode(key): value for key, value in mapping.items()}

        def indexByName(records):
            #records keyed by their 'name' entry; later duplicates win
            index = {}
            for record in records:
                record = unicodeKeys(record)
                index[record['name']] = record
            return index

        #situations in which a tag can appear; allowedIn=None means top-level
        self._contextDict = indexByName(
            dict(name=ctxName, allowedIn=parent)
            for ctxName, parent in (
                ('block', None),
                ('inline', 'block'),
                ('table', 'block'),
                ('list', 'block'),
                ('li', 'list'),
                ('tr', 'table'),
                ('td', 'tr'),    #no contents for now
                ))
        for ctxRecord in self._contextDict.values():
            ctxRecord['canContain'] = set()

        #one list shared by p and all the headings
        paraAttrs = [attrName for attrName, allowed in (
                        ('style', self.allowStyleAttr),
                        ('align', self.allowAlignAttr),
                        ) if allowed]

        inlineCtx = ('inline', 'li', 'td')
        listCtx = ('block', 'td', 'li')

        tagRows = [dict(name=blockName, context='block', attrs=paraAttrs)
                    for blockName in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')]
        # NOTE(review): the empty tag name below is reproduced exactly as it
        # appears in this listing; confirm it against the installed module.
        tagRows += [dict(name=inlineName, context=inlineCtx, attrs=[])
                    for inlineName in ('strong', 'em', 'i', 'b', 'u', '', 'sup', 'sub')]
        tagRows += [
            dict(name='br', context=inlineCtx, attrs=[], selfClosing=True),
            dict(name='a', context=inlineCtx, attrs=['href']),
            # force_attrs. These attributes will be added to make sure xhtml validation passes.
            dict(name='img', context=('inline', 'li'), attrs=['src','width','height','alt'], force_attrs=['alt'], selfClosing=True),
            dict(name='table', context=('block','td'), attrs=[]),
            dict(name='tr', context='table', attrs=[]),
            dict(name='td', context='tr', attrs=[]),
            dict(name='th', context='tr', attrs=[]),
            dict(name='ul', context=listCtx, attrs=[]),
            dict(name='ol', context=listCtx, attrs=[]),
            dict(name='li', context='list', attrs=[]),
            ]
        self._tagDict = indexByName(tagRows)

        # Tags to use to cover naked text up with in the given context
        self._dataCover = unicodeKeys({
            'block': (self.DEFAULT_BLOCK_TAG,),
            'table': ('tr', 'td'),
            'tr': ('td',),
            'list': ('li',),
            })

        #table tags stay in the grammar even when allowTables is false;
        #writeStartTag and tagsAllowedInContext deal with table removal.
        if not self.allowImages:
            del self._tagDict['img']
        if not self.allowAtags:
            del self._tagDict['a']

        #work out in-place the set of tags allowed in each context.
        for tagName, tagRecord in self._tagDict.items():
            tagRecord.setdefault('selfClosing', False)
            homes = tagRecord['context']
            if not isSeq(homes):
                homes = (homes,)
            for ctxName in homes:
                self._contextDict.get(ctxName)['canContain'].add(tagName)

        #work out certain useful attributes
        self.valid_tags = set(self._tagDict.keys())
        self.valid_block_tags = self.tagsAllowedInContext('block')
        self.valid_inline_tags = self.tagsAllowedInContext('inline')
        self.valid_other_tags = self.valid_tags - self.valid_block_tags - self.valid_inline_tags

    def allowedAttrs(self, tagName):
        """Return set of allowed attributes for the tag"""
        return self._tagDict[tagName]['attrs']

    def forcedAttrs(self, tagName):
        """Return set of forced attributes for the tag"""
        if 'force_attrs' in self._tagDict[tagName]:
            return self._tagDict[tagName]['force_attrs']
        else:
            return None

    def getContext(self, tag):
        """Return the main (first listed) context for tag

        A tag's grammar record holds either one context name or a sequence
        of them; for a sequence the first entry is the main one.

        >>> g = Cleaner()
        >>> eqCheck(g.getContext('i'),'inline')
        >>> eqCheck(g.getContext('li'),'list')
        """
        homes = self._tagDict[tag]['context']
        return homes[0] if isSeq(homes) else homes

    def context(self):
        if self.openTagStack:
            c = self.openTagStack[-1][1]
        else:
            c = self.target == 'block' and 'block' or 'inline'
        return c
    context=property(context)

    def isTagAllowedInContext(self, tag, context):
        """Is the tag allowed here?

        >>> g = Cleaner()
        >>> g.isTagAllowedInContext('b','block')
        False
        >>> g.isTagAllowedInContext('a','inline')
        True
        """
        return context in self._tagDict[tag]['context']

    def tagsAllowedInContext(self, context):
        """Set of tag names allowed in this context

        >>> g = Cleaner()
        >>> eqCheck(g.tagsAllowedInContext('table'),set(['tr']))
        >>> eqCheck(g.tagsAllowedInContext('inline'),set(['em', 'a', 'b', 'sub', 'img', 'i', '', 'br', 'sup', 'strong','u']))
        """
        #special case - extreme table removal!
        if context == 'table' and not self.allowTables:
            return []

        return self._contextDict[context]['canContain']

    def reset(self):
        "get ready to do some work"
        HTMLParser.reset(self)
        self.buf = []   #holds output
        self.fixes = []  #holds warnings / debug messages / fixups done
        self.openTagStack = []      #checks for balancing
        self._started = False
        self._currentBlockTag = None   #what kind of block tag are we inside?  Usually <p>
        self._justAfterEntity = False   #flag to say if the last thing we saw was an entity.  Used to detect doubled entities in input

    def close(self):
        "Final tidyups"
        HTMLParser.close(self)
        self.cleanupClosingTags()

    def process(self, markup):
        """Clean one piece of markup and return the result.

        Steps, in order: convert to unicode, run the configured
        substitutions, put a space before the slash of <tag/> forms, strip
        surrounding whitespace and apply nakedAmpFix (defined elsewhere in
        the module), reset the parser, feed the text and close.  The joined
        output is returned as text, or encoded when an encoding was
        configured.
        """
        text = self.asUnicode(markup)
        for substitute in self.substitutions:
            text = substitute(text)
        text = re.sub('<([A-Za-z]+\\w*)/>', '<\\1 />', text)
        text = nakedAmpFix(text.strip())
        self.reset()
        self.feed(self.asUnicode(text))
        self.close()
        cleaned = ''.join(self.buf)
        if self.encoding:
            return cleaned.encode(self.encoding)
        return cleaned

    def dump(self):
        """Print the output collected so far (debugging aid)."""
        collected = ''.join(self.buf)
        print(collected)

    def asUnicode(self, markup):
        """Return markup as unicode text.

        Values that isUnicode accepts are returned untouched.  Anything
        else is decoded as UTF-8; when that fails the data is assumed to be
        Windows cp1252 and undecodable bytes are replaced.
        """
        if isUnicode(markup):
            return markup
        try:
            return markup.decode('utf8', 'strict')
        except UnicodeDecodeError:
            #assume windows encoding
            return markup.decode('cp1252', 'replace')

    def writeStartTag(self, tag, attrs={}):
        """Helper to do what it says.  Called to write a tag to output.

        Never write your own tags to output; instead call this.  This will
        maintain a stack and ensure they are balanced. It also sets the
        mode every time for you."""
        #for table removal, we just don't write it out. It's easier
        #to have writeStartTag called (from several places) because we
        #need to keep track of the fact that we are in a table-to-be-removed.
        if self.remainingLines != None and self.remainingLines <= 0:
            return
        if tag == 'table' and not self.allowTables:
            self.openTagStack.append((tag,'table'))
            return

        #self.dump()
        #prefilter to remove all block tags in inline markup

        if tag not in self.valid_inline_tags:
            if self.target == 'inline':
                return

        adict = dict(attrs)

        if tag == 'img' and self.imgSrcTr:
            if isinstance(self.imgSrcTr, str):
                p = os.path.join(self.imgSrcTr, os.path.split(adict['src'])[-1])
                p = p.replace('\\', '/')
                adict['src'] = p
            else:
                adict['src'] = self.imgSrcTr(adict['src'])

        if tag == 'a' and self.aHrefTr:
            href = adict['href']
            if isinstance(self.aHrefTr, str):
                if not (href.startswith('http://') or href.startswith('https://')):
                    adict['href'] = self.aHrefTr.rstrip('/') + '/' + href.lstrip('/')
            else:
                adict['href'] = self.aHrefTr(href)

        attrs = [ (k, adict[k]) for k, _ in attrs ]

        allowedAttrs = self.allowedAttrs(tag)
        forcedAttrs = self.forcedAttrs(tag)
        selfClosing = self._tagDict[tag]['selfClosing']
        #if selfClosing: print "found self-closing tag %s" % tag

        #rebuild the tag as a piece of text
        tagBits = ['<']
        tagBits.append(tag)
        for k, v in attrs:
            if (k in allowedAttrs or self.special_attr(k)) and v is not None:
                v = self.asUnicode(v)
                if k[0:2].lower() != 'on' and v[0:10].lower() != 'javascript':
                    tagBits.append(' %s="%s"' % (k, v))

        # If there are any forced attributes
        if forcedAttrs and len(forcedAttrs) > 0:
            tag_attrs = [k for k,v in attrs]
            for k in forcedAttrs:
                if k not in tag_attrs:
                    tagBits.append(' %s=""'% k)
        if selfClosing:
            tagBits.append('/>')
        else:
            tagBits.append('>')
        tagText = ''.join(tagBits)

        self.buf.append(tagText)

        #and put it on the stack....
        if not selfClosing:
            context = self.context  #current context
            #if block, remember how to close
            if context == 'block':
                self._currentBlockTag = (tag,'block')
            #set the mode
            if tag == 'table':
                ncontext = 'table'
            elif tag == 'tr':
                ncontext = 'tr'
            elif tag in ('td', 'th'):
                ncontext = 'td'
            elif tag in ('ul', 'ol'):
                ncontext = 'list'
            elif tag == 'li':
                ncontext = 'li'
            else:
                #block and inline always lead to inline
                ncontext = 'inline'
            self.openTagStack.append((tag,ncontext))

    def writeEndTag(self, tag):
        """Close the tag, but check for nesting errors.

        Never write to the buffer directly; this keeps the stack
        and mode organised."""
        if not self.enoughSpace(tag):
            return
        try:
            lastTag,lastContext = self.openTagStack.pop()
        except:
            print(self.openTagStack)
            raise
        if tag == 'table':
            if not self.allowTables:
                return
            if not self.currentTableHasContent():
                #remove everything inside the present table
                while True:
                    popped = self.buf.pop()
                    if popped.startswith('<table'):
                        break
                return

        #prefilter to remove all block tags in inline markup
        if self.target == 'inline':
            if tag not in self.valid_inline_tags:
                return
        if lastTag != tag:
            raise ValueError("stack is messed up trying to close %s; current open tag was %s" % (tag, lastTag))
        self.buf.append('</%s>' % tag)

    def pendingTag(self):
        "What tag is waiting?"
        try:
            return self.openTagStack[-1][0]
        except IndexError:
            return None

    def atStart(self):
        return (len(self.buf) == 0)

    def discardTag(self, tag):
        'Remove everything inside this tag from the stack and buffer'
        while self.openTagStack:
            ctag,ctxt = self.openTagStack.pop()
            n = len(self.buf) - 1
            while n:
                if self.buf[n].startswith('<' + ctag):
                    break
                n -= 1
            self.buf = self.buf[:n]
            if ctag == tag:
                return


    def currentTableHasContent(self):
        'backtrack to last <table> and see if we have any actual non-whitespace content'
        pointer = -1
        item = self.buf[pointer]
        while not item.startswith('<table'):
            #print pointer, item
            pointer -= 1
            item = self.buf[pointer]
            if not item.startswith('<'):
                if item.strip():
                    return True
        return False

    def enoughSpace(self, tag):
        '''
        Tries to determine if the text in the current tag will fit on the remaining number of lines.
        If it does, this method returns True.
        If not, it will discard the text of the current tag and return False.
        '''
        if self.remainingLines == None:
            return True
        if tag == 'p':
            lines = float(self.textLength) / self.lineWidth
        elif tag == 'li':
            # Count 3 characters for the bullet point
            lines = float(self.textLength) / (self.lineWidth - 3)
        else:
            return True
        import math
        lines = int(math.ceil(lines))
        self.remainingLines -= lines
        if self.remainingLines >= 0:
            self.textLength = 0
            return True
        self.discardTag(tag)
        return False

    def writeData(self, text):
        "Used to write out non-tag content"
        if self.remainingLines != None and self.remainingLines <= 0:
            return
        self.textLength += len(text)
        self.buf.append(text)

    def closeCurrentBlock(self):
        """This is used to close any pending inline tags, whenever
        we hit a new block start,  Healthy closing never calls this."""
        if self.atStart():
            return
        if self.target == 'inline':
            return #write nothing
        tag = self.pendingTag()
        if tag is not None:
            while tag in self.valid_inline_tags:
                self.writeEndTag(tag)
                tag = self.pendingTag()
        assert self._currentBlockTag is not None

        #if there are any more end-tags in the stack, chuck them.
        self.openTagStack = [self._currentBlockTag]
        self.writeEndTag(self._currentBlockTag[0])

    def tagInStack(self,tag):
        stack = self.openTagStack
        x = len(stack)
        while x:
            x -= 1
            if tag==stack[x][0]: return True
        return False

    def handle_data(self, data):
        data = data.replace('<', '&lt;').replace('>', '&gt;')

        if not data.strip():
            self.writeData(data)
            return

        if not self.allowTables:
            if self.tagInStack('table'):
                return

        #are we in the right mode?
        tags = self._dataCover.get(self.context,[])
        for t in tags:
            self.writeStartTag(t)

        self.writeData(data)

    def handle_comment(self, name):
        if not self.stripComments:
            self.buf.append('<!--' + self.asUnicode(name) + '-->')

    def handle_entityref(self, name):
        "Handles a named entity.  "
        if self.stripUnknownEntities and not name in self.entities:
            return ''
        self.handle_data('&%s;' % name)

    def handle_charref(self,name):
        self.handle_data('&#%s;' % name)

    def handle_starttag(self, tag, attrs):
        """Decide what to do with a start tag; only tags in the grammar survive.

        tag is lower-cased first; attrs is the list of (name, value) pairs
        from the parser and is handed on to writeStartTag, which filters
        the attributes.

        The 'state machine' choices are all here, and in handle_endtag. At any
        point, we know the current context, and the next tag which has its own
        context. If the next tag is expected, fine.  If not, we need to handle
        each state transition intelligently: some combinations are repaired by
        closing or opening tags around the new one, the rest are ignored.

        While a table is being removed (allowTables false and a table on the
        stack) every start tag is ignored.  <br> is handled separately, see
        the note in that branch.  Tags outside the grammar are ignored.

        Raises ValueError if the current context is not one of the known ones.
        """
        tag = tag.lower()

        #remove ANYTHING inside a table, if removing tables.
        if not self.allowTables:
            if self.tagInStack('table'):
                return

        if tag=='br':
            """Handles the <br/> tag.

            Called directly by sgmllib instead of unknown_starttag,
            because it's a singleton tag.  What we do depends on whether
            (a) breaks are allowed in this normalisation, and
            (b) we are inside a block or between them.

            As presently implemented, with breaksAllowed=False...
               <p>one<br/>two</p>   ->  <p>one</p><p>two</p>

            ..and multiple <br> tags beyond the first will create
            extra empty paragraphs.
            """
            #no more output once the line budget is used up
            if self.remainingLines != None and self.remainingLines <= 0:
                return

            if self.breaksAllowed:
                if self.isTagAllowedInContext('br', self.context):
                    #written straight to the buffer; a break costs one line of budget
                    self.buf.append(u"<br/>")
                    if self.remainingLines != None:
                        self.remainingLines -= 1
            else:
                if self.target == 'block':
                    if self.context == 'inline':
                        self.closeCurrentBlock()
                    elif self.context == 'block':
                        #give them an empty para for each extra <br> they added
                        self.writeStartTag(self.DEFAULT_BLOCK_TAG)
                        self.writeEndTag(self.DEFAULT_BLOCK_TAG)
                else:
                    self.buf.append(' ')  #if they had two lines in the input,
                    #at least a space should separate them in the output.
        elif tag in self.valid_tags:
            context = self.context
            if tag in self.tagsAllowedInContext(context):
                #expected, write it out.  The writer will filter out any unexpected attributes.
                self.writeStartTag(tag, attrs)
            else:
                #Unexpected.  Each context combination has its own rules.
                #Seven contexts are handled below (inline, block, table, tr,
                #td, list, li), but in most combinations we
                #just want to ignore the tag.
                nextContext = self.getContext(tag)

                if context == 'inline':
                    if nextContext == 'block':
                        #a new block starts inside a block: close the old one first
                        self.closeCurrentBlock()
                        self.writeStartTag(tag, attrs)
                    else:
                        pass   #if we get a naked tr, td, li, we'll just ignore it.
                elif context == 'block':
                    if nextContext == 'inline':  #e.g. <i> at start of document
                        self.writeStartTag(self.DEFAULT_BLOCK_TAG)
                        self.writeStartTag(tag, attrs)
                    elif nextContext in ('table', 'tr', 'list'):
                        #very out-of-context tag, ignore it. e.g. <p><tr>
                        pass
                elif context == 'table':
                    #anything but a tr or td is disallowed
                    if nextContext == 'tr':  #i.e. tag is a td or th
                        #they forgot the tr, repair it
                        self.writeStartTag('tr', {})
                        self.writeStartTag(tag, attrs)
                    else:
                        pass
                elif context == 'tr':
                    #expected a td, anything higher up means we need to close
                    if nextContext == 'table':
                        #got a tr - close the current tr
                        self.writeEndTag('tr')
                        self.writeStartTag(tag, attrs)
                    elif nextContext == 'block':
                        #close the whole table
                        self.writeEndTag('tr')
                        self.writeEndTag('table')
                        self.writeStartTag(tag, attrs)
                    else:
                        pass
                elif context == 'td':
                    #brutal for now, no tags allowed here yet, but
                    #should be like inline.
                    if nextContext == 'table':
                        #got a tr - close the td
                        self.writeEndTag('td')
                        self.writeEndTag('tr')
                        self.writeStartTag(tag, attrs)
                    #elif nextContext == 'block':
                    #    pass
                    else:
                        pass
                elif context == 'list':  #directly inside ul/ol
                    if nextContext == 'list':   #got another li, close the last li
                        self.writeEndTag('li')
                        self.writeStartTag(tag, attrs)
                    elif nextContext == 'block':
                        self.closeCurrentBlock()
                        self.writeStartTag(tag, attrs)
                    else:
                        pass
                elif context == 'li':   #inside a list item
                    if nextContext == 'list':   #got another li, close the last li
                        self.writeEndTag('li')
                        self.writeStartTag(tag, attrs)
                    elif nextContext == 'block':
                        self.writeEndTag('li')
                        self.closeCurrentBlock()
                        self.writeStartTag(tag, attrs)
                    else:
                        pass
                else:
                    raise ValueError("unexpected context '%s'" % context)

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag in self.valid_tags:
            pending = self.pendingTag()
            if tag == pending:
                #all is wonderful, close it
                self.writeEndTag(tag)
            else:
                #normally, we just ignore unexpected end tags.
                #the stack will be closed at the end.  However
                #if we get a </table>, </ul> etc, it may make
                #sense to close.
                if tag == 'tr':
                    if pending == 'td':
                        #close it
                        self.writeEndTag('td')
                        self.writeEndTag(tag)
                elif tag in ('ul','ol'):
                    if pending == 'li':
                        self.writeEndTag('li')
                        self.writeEndTag(tag)
                elif tag == 'table':
                    if pending == 'td':
                        self.writeEndTag('td')
                        self.writeEndTag('tr')
                        self.writeEndTag(tag)
                    elif pending == 'tr':
                        self.writeEndTag('tr')
                        self.writeEndTag(tag)
        else:
            self.fixes.append("Ignoring unexpected end tag %s" % tag)

    def cleanupClosingTags(self):
        """ Append any missing closing tags. Called at end."""
        while self.openTagStack:
            tag = self.pendingTag()
            if not self.enoughSpace(tag):
                continue
            self.openTagStack.pop()
            #special case for <table></table> which we want to discard
            if tag == 'table' and self.buf[-1].startswith('<table'):
                self.buf.pop()
            else:
                self.buf.append(u"</%s>" % tag)
            self.fixes.append("appended missing end tag </%s>" % tag)

    def unescape(self,s):
        '''overrides entity handling in attributeValues'''
        return s

__init__(target='block', breaksAllowed=True, stripComments=False, stripUnknownEntities=True, allowImages=True, allowTables=True, allowAtags=True, allowStyleAttr=True, allowAlignAttr=True, aHrefTr=None, imgSrcTr=None, substitutions=[], maxLines=None, lineWidth=40, entities=known_entities.keys(), encoding=None, special_attr_pfxs=['rl-'])

Initialising defines your language options. You can re-use the same parser many times. if breaksAllowed, they will be written to output. if not, in inline mode they vanish, and in block mode they end the block.

substitutions is a singleton or a list whose entries may each be: a pattern pat (pat --> ''), a pair (pat, str) (pat --> str), or a callable c (c(src) --> src).

pat may be a str or a compiled pattern. These substitutions are done before parsing.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def __init__(self,
            target="block",
            breaksAllowed=True,
            stripComments=False,
            stripUnknownEntities=True,
            allowImages=True,
            allowTables=True,
            allowAtags=True,
            allowStyleAttr=True, #passed through to para and headings
            allowAlignAttr=True, #passed through to para and headings
            aHrefTr=None,
            imgSrcTr=None,
            substitutions=[],
            maxLines=None,
            lineWidth=40,
            entities = known_entities.keys(),
            encoding = None,
            special_attr_pfxs=['rl-'],
            ):
    """Initialising defines your language options.
    You can re-use the same parser many times.
    If breaksAllowed, breaks will be written to output.
    If not, in inline mode they vanish, and in block
    mode they end the block.

    target must be 'block' or 'inline'.

    aHrefTr / imgSrcTr, when set, are either a string prefix or a
    callable used by writeStartTag to rewrite <a href> / <img src>.

    substitutions is a singleton or list containing
        pat         pat --> ''
        (pat,str)   pat --> str
        callable    c(src) --> src

    pat may be a str or a compiled pattern. These substitutions
    are done before parsing.

    maxLines (None means unlimited) and lineWidth feed the line
    budget that enoughSpace() tracks; entities lists the entity
    names to keep ('lt', 'gt' and 'amp' are always added); encoding,
    when set, is used by process() to encode its result.

    Raises ValueError for an entry in substitutions that is none of
    the accepted forms.
    """
    target = self.asUnicode(target)
    self.stripUnknownEntities = stripUnknownEntities
    self.allowImages = allowImages
    self.allowTables = allowTables
    self.allowAtags = allowAtags
    self.aHrefTr = aHrefTr
    self.imgSrcTr = imgSrcTr
    self.encoding = encoding
    self.allowAlignAttr = allowAlignAttr
    self.allowStyleAttr = allowStyleAttr
    # matcher for attribute names starting with any of the special prefixes
    self.special_attr = re.compile('^(?:%s)' % '|'.join(special_attr_pfxs)).match
    self._setupGrammar()

    assert target in (u"block", u"inline"), "unexpected block '%s', must be 'block' or 'inline'" % target
    self.target = target
    HTMLParser.__init__(self,
        **(dict(convert_charrefs=False) if sys.version_info>=(3,4) else {}))
    self.breaksAllowed = breaksAllowed
    self.stripComments = stripComments
    self.entities = set(entities).union(('lt', 'gt', 'amp'))

    #prefix up the substitutions list
    # (the mutable defaults above are only read, never modified)
    if not isinstance(substitutions,(list,tuple)):
        substitutions = (substitutions,)
    S=[].append
    for s in substitutions:
        if isinstance(s,strTypes):
            s = lambda x,pat=re.compile(asUnicode(s)): pat.sub('',x)
        elif hasattr(s,'sub'):
            s = lambda x,pat=s: pat.sub('',x)
        elif isinstance(s,(tuple,list)) and len(s)==2:
            p=s[0]
            if isinstance(p,str):
                s = lambda x,pat=re.compile(p),s=s[1]: pat.sub(s,x)
            elif hasattr(p,'sub'):
                s = lambda x,pat=p,s=s[1]: pat.sub(s,x)
            else:
                # wrap in a 1-tuple: formatting with the bare tuple would
                # raise TypeError instead of the intended ValueError
                raise ValueError('Invalid value %r in substitions list' % (s,))
        elif not callable(s):
            # same wrapping: s may be a tuple of any length here
            raise ValueError('Invalid value %r in substitions list' % (s,))
        S(s)
    # S is the bound append of a fresh list; recover the list itself
    self.substitutions = S.__self__
    self.remainingLines = maxLines
    self.lineWidth = lineWidth
    self.textLength = 0

allowedAttrs(tagName)

Return set of allowed attributes for the tag

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def allowedAttrs(self, tagName):
    """Look up the attributes permitted on ``tagName`` in the tag table."""
    tagInfo = self._tagDict[tagName]
    return tagInfo['attrs']

asUnicode(markup)

convert to unicode

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def asUnicode(self, markup):
    """Return ``markup`` as unicode text, decoding it if necessary.

    Anything that is not already unicode is decoded as strict UTF-8;
    if that fails it is decoded as cp1252 with undecodable bytes
    replaced.
    """
    #TODO
    if isUnicode(markup):
        return markup
    try:
        return markup.decode('utf8', 'strict')
    except UnicodeDecodeError:
        #assume windows encoding
        return markup.decode('cp1252', 'replace')

cleanupClosingTags()

Append any missing closing tags. Called at end.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def cleanupClosingTags(self):
    """Append any missing closing tags.  Called at end.

    Unwinds ``self.openTagStack`` from the innermost tag outwards and
    writes a closing tag to ``self.buf`` for each entry, recording a
    message in ``self.fixes``.  Two cases write nothing:

    * a ``<table>`` that is the last thing in the buffer (an empty
      table) is removed instead of being closed;
    * a ``<table>`` seen while ``allowTables`` is false was only pushed
      on the stack by writeStartTag and never written to the buffer,
      so no ``</table>`` is emitted for it.
    """
    while self.openTagStack:
        tag = self.pendingTag()
        if not self.enoughSpace(tag):
            # enoughSpace() discarded the tag, which also popped it
            # from the stack; move on to whatever is left.
            continue
        self.openTagStack.pop()
        if tag == 'table' and not self.allowTables:
            # the opening tag was suppressed, so closing it would
            # leave an unbalanced </table> in the output.
            continue
        #special case for <table></table> which we want to discard
        # (guard against an empty buffer before peeking at the last item)
        if tag == 'table' and self.buf and self.buf[-1].startswith('<table'):
            self.buf.pop()
        else:
            self.buf.append(u"</%s>" % tag)
        self.fixes.append("appended missing end tag </%s>" % tag)

close()

Final tidyups

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def close(self):
    """Finish parsing, then balance the output.

    Runs the base parser's close() first, then appends closing tags
    for anything still open.
    """
    HTMLParser.close(self)
    self.cleanupClosingTags()

closeCurrentBlock()

This is used to close any pending inline tags whenever we hit a new block start. Healthy closing never calls this.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def closeCurrentBlock(self):
    """Force the block that is currently open to be closed.

    Used when a new block starts while inline tags are still pending:
    the pending inline tags are closed innermost-first, then the
    enclosing block tag itself.  Does nothing at the very start of the
    document or when the target is 'inline'.  Healthy closing never
    calls this.
    """
    if self.atStart() or self.target == 'inline':
        return #write nothing

    innermost = self.pendingTag()
    if innermost is not None:
        while innermost in self.valid_inline_tags:
            self.writeEndTag(innermost)
            innermost = self.pendingTag()

    blockTag = self._currentBlockTag
    assert blockTag is not None

    #if there are any more end-tags in the stack, chuck them.
    self.openTagStack = [blockTag]
    self.writeEndTag(blockTag[0])

currentTableHasContent()

Backtrack to the last `<table>` and see if we have any actual non-whitespace content.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def currentTableHasContent(self):
    """Backtrack to last <table> and see if we have any actual non-whitespace content.

    Scans ``self.buf`` from the end towards the most recent item that
    starts with ``<table``.  Returns True as soon as a non-tag item
    (one not starting with ``<``) with non-whitespace text is found,
    and False if the table start is reached first.  Also returns False
    when the buffer holds no ``<table`` item at all.
    """
    # reversed() inspects every item, including the very last one, and
    # simply runs out on an empty or table-less buffer instead of raising.
    for item in reversed(self.buf):
        if item.startswith('<table'):
            return False
        if not item.startswith('<') and item.strip():
            return True
    return False

discardTag(tag)

Remove everything inside this tag from the stack and buffer

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def discardTag(self, tag):
    """Remove everything inside this tag from the stack and buffer.

    Pops open tags off ``self.openTagStack`` until ``tag`` itself has
    been popped.  For each popped tag the buffer is cut back to just
    before the last item that starts with ``<`` + that tag name; if no
    such item is found the whole buffer is dropped.
    """
    while self.openTagStack:
        ctag, _ = self.openTagStack.pop()
        opener = '<' + ctag
        # walk backwards to the opening tag; stopping at 0 also covers
        # an empty buffer, where the start index is -1
        n = len(self.buf) - 1
        while n > 0 and not self.buf[n].startswith(opener):
            n -= 1
        self.buf = self.buf[:max(n, 0)]
        if ctag == tag:
            return

enoughSpace(tag)

Tries to determine if the text in the current tag will fit on the remaining number of lines. If it does, this method returns True. If not, it will discard the text of the current tag and return False.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def enoughSpace(self, tag):
    '''
    Estimate whether the text collected for ``tag`` fits in the lines left.

    Only 'p' and 'li' are measured; every other tag, and any call made
    while no line limit is set, returns True without touching the budget.
    The estimated line count is subtracted from self.remainingLines.
    If the budget is not exhausted the text counter is reset and True
    is returned; otherwise the tag is discarded and False is returned.
    '''
    if self.remainingLines is None:
        return True

    if tag == 'p':
        usableWidth = self.lineWidth
    elif tag == 'li':
        # Count 3 characters for the bullet point
        usableWidth = self.lineWidth - 3
    else:
        return True

    import math
    needed = int(math.ceil(float(self.textLength) / usableWidth))
    self.remainingLines -= needed
    if self.remainingLines < 0:
        self.discardTag(tag)
        return False
    self.textLength = 0
    return True

forcedAttrs(tagName)

Return set of forced attributes for the tag

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def forcedAttrs(self, tagName):
    """Return the 'force_attrs' entry for ``tagName``, or None if it has none."""
    return self._tagDict[tagName].get('force_attrs')

getContext(tag)

Return main context for tag

g = Cleaner() eqCheck(g.getContext('i'),'inline') eqCheck(g.getContext('li'),'list')

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def getContext(self, tag):
    """Return main context for tag

    When the tag's 'context' entry is a sequence, its first element
    is taken as the main context; otherwise the entry itself is used.

    >>> g = Cleaner()
    >>> eqCheck(g.getContext('i'),'inline')
    >>> eqCheck(g.getContext('li'),'list')
    """
    declared = self._tagDict[tag]['context']
    return declared[0] if isSeq(declared) else declared

handle_entityref(name)

Handles a named entity.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def handle_entityref(self, name):
    """Handle a named entity reference such as ``&name;``.

    When unknown entities are being stripped and ``name`` is not in
    self.entities, the reference is dropped; otherwise it is passed on
    to handle_data in its ``&name;`` form.
    """
    if self.stripUnknownEntities:
        if name not in self.entities:
            return ''
    self.handle_data('&%s;' % name)

handle_starttag(tag, attrs)

Delete all tags except for legal ones, and strip some obvious javascript

The 'state machine' choices are all here, and in unknown_endtag. At any point, we know the current context, and the next tag which has its own context. If the next tag is expected, fine. If not, we need to handle each state transition intelligently.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def handle_starttag(self, tag, attrs):
    """ Delete all tags except for legal ones, and strip some obvious javascript

    The 'state machine' choices are all here, and in unknown_endtag. At any point,
    we know the current context, and the next tag which has its own context. If
    the next tag is expected, fine.  If not, we need to handle each state
    transition intelligently.

    tag is lower-cased before use.  Tags that are neither 'br' nor in
    self.valid_tags are dropped silently.  All output goes through
    writeStartTag / writeEndTag (apart from the literal <br/> and the
    space written for a suppressed break), which is also where
    attributes are filtered.

    Raises ValueError if self.context is not one of the contexts
    handled below.
    """
    tag = tag.lower()

    #remove ANYTHING inside a table, if removing tables.
    if not self.allowTables:
        if self.tagInStack('table'):
            return

    if tag=='br':
        # NOTE: the string below is a bare expression statement used as an
        # inline comment; it is not a docstring.
        """Handles the <br/> tag.

        Called directly by sgmllib instead of unknown_starttag,
        because it's a singleton tag.  What we do depends on whether
        (a) breaks are allowed in this normalisation, and
        (b) we are inside a block or between them.

        As presently implemented, with breaksAllowed=False...
           <p>one<br/>two</p>   ->  <p>one</p><p>two</p>

        ..and multiple <br> tags beyond the first will create
        extra empty paragraphs.
        """
        # line budget already used up: drop the break entirely
        if self.remainingLines != None and self.remainingLines <= 0:
            return

        if self.breaksAllowed:
            if self.isTagAllowedInContext('br', self.context):
                self.buf.append(u"<br/>")
                # each break written costs one line of the budget
                if self.remainingLines != None:
                    self.remainingLines -= 1
        else:
            if self.target == 'block':
                if self.context == 'inline':
                    self.closeCurrentBlock()
                elif self.context == 'block':
                    #give them an empty para for each extra <br> they added
                    self.writeStartTag(self.DEFAULT_BLOCK_TAG)
                    self.writeEndTag(self.DEFAULT_BLOCK_TAG)
            else:
                self.buf.append(' ')  #if they had two lines in the input,
                #at least a space should separate them in the output.
    elif tag in self.valid_tags:
        context = self.context
        if tag in self.tagsAllowedInContext(context):
            #expected, write it out.  The writer will filter out any unexpected attributes.
            self.writeStartTag(tag, attrs)
        else:
            #Unexpected.  Each context combination has its own rules.
            #We have 6 contexts so 36 combinations, but in most we
            #just want to ignore the tag.
            nextContext = self.getContext(tag)

            if context == 'inline':
                if nextContext == 'block':
                    self.closeCurrentBlock()
                    self.writeStartTag(tag, attrs)
                else:
                    pass   #if we get a naked tr, td, li, we'll just ignore it.
            elif context == 'block':
                if nextContext == 'inline':  #e.g. <i> at start of document
                    # wrap the stray inline tag in a default block first
                    self.writeStartTag(self.DEFAULT_BLOCK_TAG)
                    self.writeStartTag(tag, attrs)
                elif nextContext in ('table', 'tr', 'list'):
                    #very out-of-context tag, ignore it. e.g. <p><tr>
                    pass
            elif context == 'table':
                #anything but a tr or td is disallowed
                if nextContext == 'tr':  #i.e. tag is a td
                    #they forgot the tr, repair it
                    self.writeStartTag('tr', {})
                    self.writeStartTag(tag, attrs)
                else:
                    pass
            elif context == 'tr':
                #expected a td, anything higher up means we need to close
                if nextContext == 'table':
                    #got a tr - close the current tr
                    self.writeEndTag('tr')
                    self.writeStartTag(tag, attrs)
                elif nextContext == 'block':
                    #close the whole table
                    self.writeEndTag('tr')
                    self.writeEndTag('table')
                    self.writeStartTag(tag, attrs)
                else:
                    pass
            elif context == 'td':
                #brutal for now, no tags allowed here yet, but
                #should be like inline.
                if nextContext == 'table':
                    #got a tr - close the td
                    self.writeEndTag('td')
                    self.writeEndTag('tr')
                    self.writeStartTag(tag, attrs)
                #elif nextContext == 'block':
                #    pass
                else:
                    pass
            elif context == 'list':  #tag is li
                if nextContext == 'list':   #got another li, close the last li
                    self.writeEndTag('li')
                    self.writeStartTag(tag, attrs)
                elif nextContext == 'block':
                    self.closeCurrentBlock()
                    self.writeStartTag(tag, attrs)
                else:
                    pass
            elif context == 'li':   #tag is li
                if nextContext == 'list':   #got another li, close the last li
                    self.writeEndTag('li')
                    self.writeStartTag(tag, attrs)
                elif nextContext == 'block':
                    self.writeEndTag('li')
                    self.closeCurrentBlock()
                    self.writeStartTag(tag, attrs)
                else:
                    pass
            else:
                raise ValueError("unexpected context '%s'" % context)

isTagAllowedInContext(tag, context)

Is the tag allowed here?

g = Cleaner() g.isTagAllowedInContext('b','block') False g.isTagAllowedInContext('a','inline') True

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def isTagAllowedInContext(self, tag, context):
    """Is the tag allowed here?

    True when ``context`` is found in the tag's 'context' entry.

    >>> g = Cleaner()
    >>> g.isTagAllowedInContext('b','block')
    False
    >>> g.isTagAllowedInContext('a','inline')
    True
    """
    permitted = self._tagDict[tag]['context']
    return context in permitted

pendingTag()

What tag is waiting?

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def pendingTag(self):
    """Name of the innermost open tag, or None when nothing is open."""
    stack = self.openTagStack
    try:
        innermost = stack[-1]
        return innermost[0]
    except IndexError:
        return None

process(markup)

The main loop - call this with your markup

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def process(self, markup):
    """The main loop - call this with your markup.

    Applies the configured substitutions, puts a space before the
    slash of compact self-closing tags, strips and amp-fixes the text,
    then resets the parser, feeds it the result and closes it.
    Returns the joined output buffer, encoded when self.encoding is set.
    """
    text = self.asUnicode(markup)
    for substitute in self.substitutions:
        text = substitute(text)
    text = re.sub('<([A-Za-z]+\\w*)/>', '<\\1 />', text)
    text = nakedAmpFix(text.strip())

    self.reset()
    text = self.asUnicode(text)
    self.feed(text)
    self.close()

    cleaned = ''.join(self.buf)
    if self.encoding:
        return cleaned.encode(self.encoding)
    return cleaned

reset()

get ready to do some work

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def reset(self):
    """Return the parser and all per-document state to a clean slate."""
    HTMLParser.reset(self)
    # output and diagnostics
    self.buf = []     #holds output
    self.fixes = []   #holds warnings / debug messages / fixups done
    # structural tracking
    self.openTagStack = []         #checks for balancing
    self._currentBlockTag = None   #what kind of block tag are we inside?  Usually <p>
    # parse-progress flags
    self._started = False
    self._justAfterEntity = False  #flag to say if the last thing we saw was an entity.  Used to detect doubled entities in input

tagsAllowedInContext(context)

Set of tag names allowed in this context

g = Cleaner() eqCheck(g.tagsAllowedInContext('table'),set(['tr'])) eqCheck(g.tagsAllowedInContext('inline'),set(['em', 'a', 'b', 'sub', 'img', 'i', '', 'br', 'sup', 'strong','u']))

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def tagsAllowedInContext(self, context):
    """Set of tag names allowed in this context

    Returns the context's 'canContain' entry, except that nothing is
    allowed inside a table while tables are being removed.

    >>> g = Cleaner()
    >>> eqCheck(g.tagsAllowedInContext('table'),set(['tr']))
    >>> eqCheck(g.tagsAllowedInContext('inline'),set(['em', 'a', 'b', 'sub', 'img', 'i', '', 'br', 'sup', 'strong','u']))
    """
    if context == 'table':
        #special case - extreme table removal!
        if not self.allowTables:
            return []

    contextInfo = self._contextDict[context]
    return contextInfo['canContain']

unescape(s)

overrides entity handling in attributeValues

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def unescape(self, s):
    """Identity hook: hand the value ``s`` back untouched.

    Per the original author's note, this overrides entity handling
    in attribute values.
    """
    return s

writeData(text)

Used to write out non-tag content

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def writeData(self, text):
    """Write non-tag content to the output buffer.

    The text is dropped when a line limit is set and already used up;
    otherwise its length is added to the running text counter and the
    text is appended to the buffer.
    """
    budget = self.remainingLines
    outOfLines = budget is not None and budget <= 0
    if outOfLines:
        return
    self.textLength += len(text)
    self.buf.append(text)

writeEndTag(tag)

Close the tag, but check for nesting errors.

Never write to the buffer directly; this keeps the stack and mode organised.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def writeEndTag(self, tag):
    """Close the tag, but check for nesting errors.

    Never write to the buffer directly; this keeps the stack
    and mode organised.

    The innermost open tag is popped from the stack before any of the
    checks below, so the early returns still leave it removed.
    Nothing is written when there is not enough space for the tag,
    when tag is a table and tables are disallowed or the table has no
    content, or when the target is 'inline' and tag is not an inline
    tag.

    Raises ValueError if the popped tag is not ``tag``; an empty stack
    propagates the error from the pop.
    """
    # enoughSpace() discards the tag itself when the line budget is exceeded
    if not self.enoughSpace(tag):
        return
    try:
        lastTag,lastContext = self.openTagStack.pop()
    except:
        # debugging aid only: show the stack, then let the error propagate
        print(self.openTagStack)
        raise
    if tag == 'table':
        if not self.allowTables:
            return
        if not self.currentTableHasContent():
            #remove everything inside the present table
            while True:
                popped = self.buf.pop()
                if popped.startswith('<table'):
                    break
            return

    #prefilter to remove all block tags in inline markup
    if self.target == 'inline':
        if tag not in self.valid_inline_tags:
            return
    # nesting check: the tag being closed must be the one most recently opened
    if lastTag != tag:
        raise ValueError("stack is messed up trying to close %s; current open tag was %s" % (tag, lastTag))
    self.buf.append('</%s>' % tag)

writeStartTag(tag, attrs={})

Helper to do what it says. Called to write a tag to output.

Never write your own tags to output; instead call this. This will maintain a stack and ensure they are balanced. It also sets the mode every time for you.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
def writeStartTag(self, tag, attrs={}):
    """Helper to do what it says.  Called to write a tag to output.

    Never write your own tags to output; instead call this.  This will
    maintain a stack and ensure they are balanced. It also sets the
    mode every time for you.

    tag is the tag name; attrs is a sequence of (name, value) pairs
    (an empty dict is also accepted and means "no attributes").

    Nothing is written when the line budget is used up, when the tag
    is a table and tables are disallowed (the tag is still pushed on
    the stack so its contents can be suppressed), or when the target
    is 'inline' and the tag is not an inline tag.

    Only attributes that are allowed for the tag, or whose name matches
    a special prefix, are kept.  Attributes whose name starts with 'on'
    or whose value starts with 'javascript' (ignoring leading
    whitespace) are dropped, and double quotes in values are written as
    &quot; so a value cannot break out of its quotes.  <a href> and
    <img src> are rewritten through aHrefTr / imgSrcTr when those are
    configured and the attribute is present.
    """
    #for table removal, we just don't write it out. It's easier
    #to have writeStartTag called (from several places) because we
    #need to keep track of the fact that we are in a table-to-be-removed.
    if self.remainingLines is not None and self.remainingLines <= 0:
        return
    if tag == 'table' and not self.allowTables:
        self.openTagStack.append((tag,'table'))
        return

    #prefilter to remove all block tags in inline markup
    if tag not in self.valid_inline_tags:
        if self.target == 'inline':
            return

    adict = dict(attrs)

    # Only rewrite src/href when the attribute is actually present with a
    # value; a bare <img> or <a name="..."> must not raise KeyError.
    if tag == 'img' and self.imgSrcTr and adict.get('src') is not None:
        if isinstance(self.imgSrcTr, str):
            # string form: keep the file name, replace the directory
            p = os.path.join(self.imgSrcTr, os.path.split(adict['src'])[-1])
            p = p.replace('\\', '/')
            adict['src'] = p
        else:
            adict['src'] = self.imgSrcTr(adict['src'])

    if tag == 'a' and self.aHrefTr and adict.get('href') is not None:
        href = adict['href']
        if isinstance(self.aHrefTr, str):
            # string form: prefix relative links, leave absolute ones alone
            if not (href.startswith('http://') or href.startswith('https://')):
                adict['href'] = self.aHrefTr.rstrip('/') + '/' + href.lstrip('/')
        else:
            adict['href'] = self.aHrefTr(href)

    # keep the original attribute order, picking up any rewritten values
    attrs = [ (k, adict[k]) for k, _ in attrs ]

    allowedAttrs = self.allowedAttrs(tag)
    forcedAttrs = self.forcedAttrs(tag)
    selfClosing = self._tagDict[tag]['selfClosing']

    #rebuild the tag as a piece of text
    tagBits = ['<']
    tagBits.append(tag)
    for k, v in attrs:
        if (k in allowedAttrs or self.special_attr(k)) and v is not None:
            v = self.asUnicode(v)
            # lstrip: leading whitespace must not hide a javascript: value
            if k[0:2].lower() != 'on' and v.lstrip()[0:10].lower() != 'javascript':
                # the value is written inside double quotes, so a literal
                # double quote would end it early and allow extra
                # attributes to be injected
                tagBits.append(' %s="%s"' % (k, v.replace('"', '&quot;')))

    # If there are any forced attributes
    if forcedAttrs and len(forcedAttrs) > 0:
        tag_attrs = [k for k,v in attrs]
        for k in forcedAttrs:
            if k not in tag_attrs:
                tagBits.append(' %s=""'% k)
    if selfClosing:
        tagBits.append('/>')
    else:
        tagBits.append('>')
    tagText = ''.join(tagBits)

    self.buf.append(tagText)

    #and put it on the stack....
    if not selfClosing:
        context = self.context  #current context
        #if block, remember how to close
        if context == 'block':
            self._currentBlockTag = (tag,'block')
        #set the mode
        if tag == 'table':
            ncontext = 'table'
        elif tag == 'tr':
            ncontext = 'tr'
        elif tag in ('td', 'th'):
            ncontext = 'td'
        elif tag in ('ul', 'ol'):
            ncontext = 'list'
        elif tag == 'li':
            ncontext = 'li'
        else:
            #block and inline always lead to inline
            ncontext = 'inline'
        self.openTagStack.append((tag,ncontext))

cleanBlocks(input, **options)

Accept markup as one or more blocks.

The output of this should be safe for use within a `<div>` or `<body>` tag in HTML, and also convertible to RML.
Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
@unifunc
def cleanBlocks(input, **options):
    """Clean markup that consists of one or more blocks.

    Runs a block-target Cleaner, built with the given options, over
    the input.  The result is meant to be safe inside a <div> or
    <body> tag in HTML, and also convertible to RML.
    """
    blockCleaner = Cleaner(target='block', **options)
    return blockCleaner.process(input)

cleanInline(input, **options)

Accept and normalize markup for use inline.

The output of this should be safe for use within a `<p>` tag in HTML, and also convertible to RML.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
@unifunc
def cleanInline(input, **options):
    """Clean and normalize markup for inline use.

    Runs an inline-target Cleaner, built with the given options, over
    the input.  The result is meant to be safe inside a <p> tag in
    HTML, and also convertible to RML.
    """
    inlineCleaner = Cleaner(target='inline', **options)
    return inlineCleaner.process(input)

cleanPlain(input, **options)

Remove all tags to output plain text.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
@unifunc
def cleanPlain(input, **options):
    """Reduce markup to plain text.

    All tags are stripped from the input and the remaining text is
    escaped.  ``options`` is accepted but not used.
    """
    bareText = stripTags(input)
    return escape(bareText)

filterRE(s, r)

Substitutes the matches of r in str with an empty string.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
@unifunc
def filterRE(s, r):
    """Delete every match of ``r`` from ``s``.

    ``r`` may be a compiled pattern (anything with a ``sub`` method)
    or a pattern string, which is handed to re.sub.
    """
    if hasattr(r, 'sub'):
        return r.sub('', s)
    return re.sub(r, '', s)

fixTruncated(s)

Try to remove truncated tags at the end of str.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
@unifunc
def fixTruncated(s):
    """Strip whatever the ``truncated_tag`` pattern matches from ``s``.

    Intended to remove a tag that was cut off at the end of the string.
    """
    repaired = filterRE(s, truncated_tag)
    return repaired

truncateHTML(input, maxLines, **options)

Truncates HTML to a maximum of maxLines lines of text. Tags don't count towards the text length. Lists, tables and other big blocks get removed completely if the limit is reached inside them.

Source code in lib/python3.13/site-packages/rlextra/radxml/html_cleaner.py
@unifunc
def truncateHTML(input, maxLines, **options):
    '''
    Truncates html to a maximum of maxLines lines of text.
    Tags don't count towards the text length.
    Lists, tables and other big blocks get removed completely if
    the limit is reached inside.

    The work is done by a Cleaner created with breaksAllowed=False and
    the given maxLines; any other options are passed through to it.
    '''
    return Cleaner(breaksAllowed=False, maxLines=maxLines, **options).process(input)

functions in xhtml2rml

xhtml2rml(xml, paraStyle='normal', tableStyle='noPaddingStyle', bulletStyle='bullet', pathTransform=None, imageTransformKwds={}, allowMailtoLinks=False, useModernLists=True, ulStyle=None, olStyle=None, liParaStyle=None, tagAttrs={},)

Convert chunk of our mini-html to RML.