Truncate HTML to a certain number of words

import re def truncate_html_words(self, num): """ Truncates html to a certain number of words (not counting tags and comments). Closes opened tags if they were correctly closed in the given html. """ s=self.get_intro() length = int(num) if length <= 0: return '' html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input') # Set up regular expressions re_words = re.compile(r'&.*?;|<.*?>|([A-Za-z0-9][\w-]*)') re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>') # Count non-HTML words and keep note of open tags pos = 0 ellipsis_pos = 0 words = 0 open_tags = [] while words <= length: m = re_words.search(s, pos) if not m: # Checked through whole string break pos = m.end(0) if m.group(1): # It's an actual non-HTML word words += 1 if words == length: ellipsis_pos = pos continue # Check for tag tag = re_tag.match(m.group(0)) if not tag or ellipsis_pos: # Don't worry about non tags or tags after our truncate point continue closing_tag, tagname, self_closing = tag.groups() tagname = tagname.lower() # Element names are always case-insensitive if self_closing or tagname in html4_singlets: pass elif closing_tag: # Check for match in open tags list try: i = open_tags.index(tagname) except ValueError: pass else: # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags open_tags = open_tags[i+1:] else: # Add it to the start of the open tags list open_tags.insert(0, tagname) if words <= length: # Don't try to close tags if we don't need to truncate return s out = s[:ellipsis_pos] + ' ...' # Close any tags still open for tag in open_tags: out += '</%s>' % tag # Return string return out
Truncates html to a certain number of words (not counting tags and comments).
Closes opened tags if they were correctly closed in the given html.

Be the first to comment

You can use [html][/html], [css][/css], [php][/php] and more to embed the code. Urls are automatically hyperlinked. Line breaks and paragraphs are automatically generated.