#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Title:          Basic Wiki engine
Author:         Will Hardy
Project:        Content manager
Date:           September 2007
Test Suite:     python ./wiki_engine.py
$Revision: 206 $

Copyright:      Will Hardy 2007, released under GPL version 3 (see licence.txt)

Description:    A simple 'wiki' engine, providing useful tools for converting
                plain text into HTML.
                The wiki engine can be used as a filter in Django.

                dewiki:         parses input and creates HTML links, quotes,
                                dashes, email addresses and image filenames.
                paragraphs:     parses input and creates HTML paragraphs,
                                headings, lists and definitions.
                number_format:  for floats that need special formatting.
                urlify:         converts a string into an equivalent URL
                                friendly version.
                isRedirect:     given wiki text, it will determine if it is a
                                redirect page.

improvements:
    * An intermediate format (i.e. not HTML) breaks our simplicity, but is far
      more flexible and good for the long term. This would allow such wonderful
      things as LaTeX output of wiki text and so on.
"""

import locale
import re

try:
    from django import template
    from django.template.defaultfilters import stringfilter
except ImportError:
    # The conversion functions themselves are plain string processing, so the
    # module (and its doctests) stays usable when Django is not installed.
    template = None
    stringfilter = None

__all__ = ('dewiki', 'paragraphs', 'number_format', 'urlify', 'isRedirect')


class _FilterRegistryStub(object):
    """Stand-in for ``template.Library`` when Django cannot be imported."""

    def filter(self, func):
        """Return ``func`` unchanged, like a bare ``@register.filter``."""
        return func


if template is not None:
    register = template.Library()
else:
    register = _FilterRegistryStub()


# NOTE(review): the copy of this file that was reviewed had its HTML markup
# stripped out of every string literal and the "<name>" part stripped out of
# every regex named group. Group names below are taken from the
# ``match.group("...")`` calls; the HTML in the substitution strings is
# reconstructed from the number of format arguments and from the docstring
# examples. Please confirm the exact markup against version control.

_RE_FLAGS = re.UNICODE

# Typographic replacements are written as HTML entities so that the module
# works on both byte strings and unicode strings without encoding errors.
_LEFT_DOUBLE_QUOTE_RE = re.compile(r'([^\w=]|^)"(\w)', _RE_FLAGS)
_RIGHT_DOUBLE_QUOTE_RE = re.compile(r'(\w)"([^\w>]|$)', _RE_FLAGS)
_LEFT_SINGLE_QUOTE_RE = re.compile(r"([^\w])'", _RE_FLAGS)
_M_DASH_RE = re.compile(r"(\w\s+)-(\s+\w)", _RE_FLAGS)
_N_DASH_RE = re.compile(r"(\d)-(\d)", _RE_FLAGS)

# [[page#ref|text]] -- every part is optional.
_INTERNAL_LINK_RE = re.compile(
    r'\[\[(?P<page>[\w/ _]+)?(?P<ref>#[\w _]+)?(\|(?P<text>[\w ]+))?\]\]',
    _RE_FLAGS)

# [prefix://url#ref text] -- prefix and text are optional.
_EXTERNAL_LINK_RE = re.compile(
    r'\[(?P<prefix>(http|https|ftp)://)?(?P<url>[\w\d.:/#%_-]+)'
    r'( (?P<name>[^\]]+))?\]',
    _RE_FLAGS)

_EMAIL_RE = re.compile(r'([^\s@]+)@([^\s@]+)', _RE_FLAGS)

# filename.jpg or filename.png, one separator character that is not a double
# quote (so file names inside existing attributes are left alone), then an
# optional "(caption)".
_IMAGE_RE = re.compile(
    r'(?P<filename>(?P<basename>[^\s]+)\.(jpg|png|JPG|PNG))(?P<sep>[^"])'
    r'(\((?P<caption>[^)]+)\))?',
    _RE_FLAGS)

_IMAGE_DIRECTORY = '/media/images/'  # Should get this from somewhere else

# Words dropped from slugs by urlify().
_BANNED_WORDS = (
    "a", "an", "as", "at", "before", "but", "by", "for", "from",
    "is", "in", "into", "like", "of", "off", "on", "onto", "per",
    "since", "than", "the", "this", "that", "to", "up", "via", "with",
)
_BANNED_WORDS_RE = re.compile(
    r'(^|\s+)(%s)(\s+|$)' % '|'.join(_BANNED_WORDS), _RE_FLAGS)
_BANNED_CHARS_RE = re.compile(r'[^-\w\s]', _RE_FLAGS)
_BANNED_CHARS_KEEP_SLASHES_RE = re.compile(r'[^-\w\s/]', _RE_FLAGS)
_SLUG_SEPARATORS_RE = re.compile(r'[-\s]+')

# Block states tracked by paragraphs().
_DEFINITION, _UNORDERED_LIST = 1, 2


def _internal_link(match):
    """Build the anchor for one ``[[page#ref|text]]`` match."""
    page = match.group("page")
    ref = match.group("ref")
    # If there is a page reference given, convert it to a url
    if page:
        url = '/%s/%s' % (urlify(page, keep_slashes=True), ref or "")
    else:
        url = ref or "/"
    # Fall back to the reference or the url itself, so that links such as
    # [[#ref]] never render the literal text "None".
    text = match.group("text") or page or ref or url
    return '<a href="%s">%s</a>' % (url, text)


def _external_link(match):
    """Build the anchor for one ``[prefix://url text]`` match."""
    prefix = match.group("prefix")
    target = match.group("url")
    url = (prefix or "http://") + target
    # The given name, or the url exactly as it was typed
    name = match.group("name") or (prefix or "") + target
    return '<a href="%s">%s</a>' % (url, name)


def _image_tag(match):
    """Build the image element for one ``file.jpg (caption)`` match."""
    filename = match.group("filename") or ""
    alt = match.group("caption") or match.group("basename") or filename
    sep = match.group("sep") or ""
    # The separator character consumed by the pattern is put back after the
    # element. The original also had a commented-out variant that wrapped
    # captioned images in a container with an odd/even class.
    return '<img src="%s%s" alt="%s" />%s' % (
        _IMAGE_DIRECTORY, filename, alt, sep)


#@stringfilter # Breaks doctest!!
@register.filter
def dewiki(value):
    """
    Converts wiki style inline markup to HTML.

    Handles, in this order: typographic quotes, dashes and ellipses (emitted
    as HTML entities), internal links ``[[page#ref|text]]``, external links
    ``[url text]``, e-mail addresses and image file names (``.jpg``/``.png``,
    optionally followed by a ``(caption)``).

    value:   the wiki text (byte string or unicode string).
    returns: the text with the markup replaced by HTML.

    The following examples form the unit test cases:

    >>> dewiki('[site.com Site Name]')
    '<a href="http://site.com">Site Name</a>'
    >>> dewiki('[http://site.com Site Name]')
    '<a href="http://site.com">Site Name</a>'
    >>> dewiki('[https://site.com Site Name]')
    '<a href="https://site.com">Site Name</a>'
    >>> dewiki('[site.com]')
    '<a href="http://site.com">site.com</a>'
    >>> dewiki('[http://site.com]')
    '<a href="http://site.com">http://site.com</a>'
    >>> dewiki('[https://site.com]')
    '<a href="https://site.com">https://site.com</a>'
    >>> dewiki('[[localpage|Local Page]]')
    '<a href="/localpage/">Local Page</a>'
    >>> dewiki('[[localpage/7|Local Page]]')
    '<a href="/localpage/7/">Local Page</a>'
    >>> dewiki('[[local page|Local Page]]')
    '<a href="/local-page/">Local Page</a>'
    >>> dewiki('[google.com#reference Google Page]')
    '<a href="http://google.com#reference">Google Page</a>'
    >>> dewiki('[http://google.com#reference Google Page]')
    '<a href="http://google.com#reference">Google Page</a>'
    >>> dewiki('[http://localhost:8000 localhost]')
    '<a href="http://localhost:8000">localhost</a>'
    >>> dewiki('[[localpage#reference|Local Page]]')
    '<a href="/localpage/#reference">Local Page</a>'
    >>> dewiki('[[#reference|On this page]]')
    '<a href="#reference">On this page</a>'
    >>> dewiki('[[#reference]]')
    '<a href="#reference">#reference</a>'
    >>> dewiki('[[|home]]')
    '<a href="/">home</a>'
    >>> dewiki('Simple "quote" usage.')
    'Simple &ldquo;quote&rdquo; usage.'
    >>> dewiki('"At the beginning and end"')
    '&ldquo;At the beginning and end&rdquo;'
    >>> dewiki('First line\\n"At the beginning and end of a line"\\n')
    'First line\\n&ldquo;At the beginning and end of a line&rdquo;\\n'
    >>> dewiki('also "at the end of a sentence".')
    'also &ldquo;at the end of a sentence&rdquo;.'
    >>> dewiki('more than "one" set of "quotes".')
    'more than &ldquo;one&rdquo; set of &ldquo;quotes&rdquo;.'
    >>> dewiki("explicit double`` ''quotes")
    'explicit double&ldquo; &rdquo;quotes'
    >>> dewiki("explicit single` quotes")
    'explicit single&lsquo; quotes'
    >>> dewiki('text - and some more text.')
    'text &mdash; and some more text.'
    >>> dewiki('see pages 67-8.')
    'see pages 67&ndash;8.'
    >>> dewiki('explicit em-dash ---')
    'explicit em-dash &mdash;'
    >>> dewiki('explicit en-dash --')
    'explicit en-dash &ndash;'
    >>> dewiki(u'Unicode macht Spa\\xdf') == u'Unicode macht Spa\\xdf'
    True
    >>> dewiki(u'[www.fu-berlin.de Freie Universit\\xe4t]') == u'<a href="http://www.fu-berlin.de">Freie Universit\\xe4t</a>'
    True
    >>> dewiki('The first link [[home]] and some more text')
    'The first link <a href="/home/">home</a> and some more text'
    >>> dewiki('The first link [[home]] and another link [[blog]]')
    'The first link <a href="/home/">home</a> and another link <a href="/blog/">blog</a>'
    >>> dewiki('''Do not replace "quotes" in <a href="link">HTML tags</a>.''')
    'Do not replace &ldquo;quotes&rdquo; in <a href="link">HTML tags</a>.'
    >>> dewiki('Here is an image_123-b.jpg, which should be shown')
    'Here is an <img src="/media/images/image_123-b.jpg" alt="image_123-b" />, which should be shown'
    >>> dewiki('Here is an image.png (with caption), which should be shown')
    'Here is an <img src="/media/images/image.png" alt="with caption" /> , which should be shown'
    """
    # QUOTES: single and double typographic quotes
    ##########################################################################
    # In the future, it may be good to not do this inside tags or within
    # particular elements. That isn't an issue at the moment.
    # NOTE(review): the element names in the original comment (and the
    # matching "FUTURE" test cases) are not visible in the reviewed copy --
    # presumably preformatted/code style elements; confirm.

    # 1. explicit left and right double quotes (``, '')
    # I'm toying with the idea of disallowing automatic quote substitutions if
    # the text has explicit quotes. This means an all or nothing option for
    # explicit quoting.
    value = value.replace("``", "&ldquo;")
    value = value.replace("''", "&rdquo;")

    # 2. Double quotes before a word (not following an equals sign: e.g. href="...)
    value = _LEFT_DOUBLE_QUOTE_RE.sub(r'\1&ldquo;\2', value)

    # 3. Double quotes after a word (not preceding a closing >, for tags)
    value = _RIGHT_DOUBLE_QUOTE_RE.sub(r'\1&rdquo;\2', value)

    # 4. Single quotes before a word and explicit left single quotes (`)
    value = _LEFT_SINGLE_QUOTE_RE.sub(r'\1&lsquo;', value)
    value = value.replace("`", "&lsquo;")

    # 5. Simply make the rest right single quotes
    value = value.replace("'", "&rsquo;")

    # 6. m-dash is some text - followed by a dash - and then some more text
    # It can be explicitly called using three dashes: "---"
    value = _M_DASH_RE.sub(r'\1&mdash;\2', value)
    value = value.replace("---", "&mdash;")

    # 7. n-dash is a dash between numbers on pages 34-5. But this is not minus.
    # It can be explicitly called using two dashes: "--"
    # This may be confused with a figure dash (used in phone numbers).
    # This is not an issue for me.
    value = _N_DASH_RE.sub(r'\1&ndash;\2', value)
    value = value.replace("--", "&ndash;")

    # 8. Ellipsis
    value = value.replace("...", "&hellip;")

    # INTERNAL LINKS
    # e.g. [[page#ref|text]] -> text
    ##########################################################################
    # A single left-to-right pass: generated markup is never rescanned, so a
    # substitution cannot be matched again.
    value = _INTERNAL_LINK_RE.sub(_internal_link, value)

    # EXTERNAL LINKS
    # e.g. [prefix://url#ref text] -> text
    ##########################################################################
    value = _EXTERNAL_LINK_RE.sub(_external_link, value)

    # EMAIL ADDRESSES
    # e.g. anything@email.com
    ##########################################################################
    # Obfuscates email addresses, with progressive enhancement.
    # Uses CSS to remove additional characters, uses javascript to create
    # mailto link.
    # This is difficult, it should use escapes, to keep it valid xhtml.
    # NOTE(review): the obfuscation markup of this template is not visible in
    # the reviewed copy; what remains replaces an address with itself.
    # Restore the original template from version control.
    email_obfuscate = r'''\1@\2'''
    value = _EMAIL_RE.sub(email_obfuscate, value)

    ## Phone numbers: +12 345 67 89 but not 1234
    ## This isn't for everybody
    #phone_numbers = re.compile(r'(\+?\d[\d ]+\d\d[\d ]+\d)', re_flags)
    #value = phone_numbers.sub(r'\1', value)

    # IMAGES
    ##########################################################################
    # It would be good to validate if the image is in the database, and use
    # the description text there.
    value = _IMAGE_RE.sub(_image_tag, value)

    return value


#@stringfilter # Breaks doctest!
@register.filter
def paragraphs(input, heading_level=2):
    """
    Converts wiki style block markup to HTML: paragraphs, headings, unordered
    lists and definitions.

    The input is processed line by line, keeping track of the block that is
    currently open:

    * a line starting with ``*`` is a list item;
    * a line with a colon after at most five long words is a definition;
    * a short line (fewer than six long words) that does not end in ``.`` or
      ``:`` and does not start with a space is a heading;
    * any other non-empty line is a paragraph.

    A "long" word has at least four characters; HTML tags are ignored when
    counting. A list that directly follows a definition is nested inside that
    definition. A blank line ends an open list.

    NOTE: This becomes reasonably complicated, when you really want to have
    smart processing, as well as explicit markup. Regular expressions may not
    suffice for such a task.

    input:         the wiki text.
    heading_level: level used for heading elements (2 gives h2); anything
                   accepted by int().
    returns:       the generated HTML.

    The following examples form the unit test cases:

    >>> print(paragraphs('Here is a paragraph.\\n\\nHere is another.'))
    <p>Here is a paragraph.</p><p>Here is another.</p>
    >>> print(paragraphs('Here is a heading\\nHere is the main text.'))
    <h2>Here is a heading</h2><p>Here is the main text.</p>
    >>> print(paragraphs('Two lines separated by a newline.\\ncharacter.'))
    <p>Two lines separated by a newline.</p><p>character.</p>
    >>> print(paragraphs('Here is not a heading because it is too long, it is probably a mistake, a line without an end\\n\\nHere is the main text.'))
    <p>Here is not a heading because it is too long, it is probably a mistake, a line without an end</p><p>Here is the main text.</p>
    >>> print(paragraphs('Here is a heading\\nHere is the main text.\\n\\nHere is some more text.'))
    <h2>Here is a heading</h2><p>Here is the main text.</p><p>Here is some more text.</p>
    >>> print(paragraphs(' Here is not a heading\\nbecause it had a space at the beginning.'))
    <p>Here is not a heading</p><p>because it had a space at the beginning.</p>
    >>> print(paragraphs('Here is a definition: and some more text.'))
    <dl><dt>Here is a definition:</dt><dd>and some more text.</dd></dl>
    >>> print(paragraphs('Here is a definition: and some more text.\\nHere is another: and here is the longer text of that definition.'))
    <dl><dt>Here is a definition:</dt><dd>and some more text.</dd><dt>Here is another:</dt><dd>and here is the longer text of that definition.</dd></dl>
    >>> print(paragraphs('Here is not a definition because it is more like a paragraph with a colon: and more text.'))
    <p>Here is not a definition because it is more like a paragraph with a colon: and more text.</p>
    >>> print(paragraphs('Here is not a definition <a href="http://site.com">colon in URL</a>.'))
    <p>Here is not a definition <a href="http://site.com">colon in URL</a>.</p>
    >>> print(paragraphs('* Here is an unordered list\\n*With another line\\n* And another'))
    <ul><li>Here is an unordered list</li><li>With another line</li><li>And another</li></ul>
    >>> print(paragraphs('\\n* Here is an unordered list\\n*With another line\\n* And another'))
    <ul><li>Here is an unordered list</li><li>With another line</li><li>And another</li></ul>
    >>> print(paragraphs('Here is a list:\\n* Here is an unordered list\\n*With another line\\n* And another'))
    <dl><dt>Here is a list:</dt><dd><ul><li>Here is an unordered list</li><li>With another line</li><li>And another</li></ul></dd></dl>
    >>> print(paragraphs('Melbourne University, Australia (current):\\n*Bachelor of Software Engineering'))
    <dl><dt>Melbourne University, Australia (current):</dt><dd><ul><li>Bachelor of Software Engineering</li></ul></dd></dl>
    >>> print(paragraphs('* one\\n\\n* two'))
    <ul><li>one</li></ul><ul><li>two</li></ul>
    """
    # Improvement: Don't just count characters, count number of (long) words
    MAX_HEADING_WORDS = 6
    MAX_DEFINITION_WORDS = 5
    MIN_WORD_LENGTH = 4

    # NOTE(review): whitespace inside the generated markup (if the original
    # emitted any) is not recoverable from the reviewed copy; none is emitted.
    chunks = []
    state = None
    closing_tag = ""  # To allow multiple lines to be grouped together

    for raw_line in input.splitlines():
        line = raw_line.strip()
        words = getWords(line)  # Words without HTML tags

        # Unordered list (first character is a star)
        if line[:1] == "*":
            if state != _UNORDERED_LIST:
                # The pending closing tag is kept *after* the list's own, so
                # a list following a definition is nested inside it.
                chunks.append(u'<ul>')
                closing_tag = u'</ul>' + closing_tag
                state = _UNORDERED_LIST
            chunks.append(u'<li>%s</li>' % line[1:].strip())

        # A definition (short line with colon (colon not at end))
        elif (":" in words
              and 0 < countWords(words.split(":", 1)[0]) <= MAX_DEFINITION_WORDS):
            if state != _DEFINITION:
                chunks.append(closing_tag + u'<dl>')
                closing_tag = u'</dd></dl>'
                state = _DEFINITION
            else:
                # Close the description of the previous definition
                chunks.append('</dd>')
            definition = splitWords(line, ":", 1)  # Ignore HTML tags
            chunks.append(u'<dt>%s:</dt><dd>%s' % (
                definition[0], "".join(definition[1:]).strip()))

        # A heading
        elif (0 < countWords(words, MIN_WORD_LENGTH) < MAX_HEADING_WORDS
              and line[-1:] not in '.:'
              and raw_line[:1] != " "):
            level = int(heading_level)
            chunks.append(u'%s<h%d>%s</h%d>' % (closing_tag, level, line, level))
            closing_tag = ""
            state = None

        # Otherwise, just do a paragraph
        elif len(line) > 0:
            chunks.append(u'%s<p>%s</p>' % (closing_tag, line))
            closing_tag = ""
            state = None

        # A blank line ends the current block
        else:
            if state == _UNORDERED_LIST:
                # Close the list now; otherwise the next list item would open
                # a second list directly inside the still-open one.
                chunks.append(closing_tag)
                closing_tag = ""
            state = None

    return "".join(chunks) + closing_tag


@register.filter
def number_format(string, digits=12, decimal=2):
    """
    Formats a number according to the current locale, with digit grouping.

    The value is formatted with the fixed pattern "%8.2f" (two decimals,
    padded to eight characters) after switching the process locale to the
    user's default locale.

    string:  anything float() accepts.
    digits:  currently unused (kept for backward compatibility).
    decimal: currently unused (kept for backward compatibility).
    returns: the formatted number, or "" if the value is not a number or the
             locale cannot be set (a template filter should fail quietly).
    """
    try:
        locale.setlocale(locale.LC_ALL, "")
        #format_string = "%%%d.%df" % (digits, decimal)
        format_string = "%8.2f"
        # locale.format() was removed in Python 3.12; format_string() takes
        # the same arguments and exists since Python 2.5.
        return locale.format_string(format_string, float(string), True)
    except (TypeError, ValueError, locale.Error):
        return ""


@register.filter
def urlify(string, max_length=50, keep_slashes=False):
    """
    Generate a valid, URL friendly slug from a string.
    Adapted from django javascript code in django.contrib.admin: media/js/urlify.js

    Keep slashes allows for partial urls to be passed through:
    "Page name/3" -> "page-name/3"

    string:       the text to convert.
    max_length:   the slug is cut off after this many characters.
    keep_slashes: if true, "/" is kept instead of being removed.
    returns:      the lowercased slug, words joined by single hyphens.

    Note that when two banned words follow each other, only the first one is
    removed (see the "in this" example below).

    The following examples form the unit test cases:

    >>> urlify('ChangeCase')
    'changecase'
    >>> urlify('Remove Space')
    'remove-space'
    >>> urlify(' Leading Space')
    'leading-space'
    >>> urlify('Trailing Spaces  ')
    'trailing-spaces'
    >>> urlify('  Much   space  ')
    'much-space'
    >>> urlify('Extra - hyphen')
    'extra-hyphen'
    >>> urlify('-extra-- hyphens -')
    'extra-hyphens'
    >>> urlify('Illegal, bad. Characters?')
    'illegal-bad-characters'
    >>> urlify('A banned word or two but not before legal ones')
    'banned-word-or-two-not-legal-ones'
    >>> urlify('Too many characters in this sentence, it has more than fifty.')
    'too-many-characters-this-sentence-it-has-more-fift'
    >>> urlify('Too many characters in this sentence', max_length=10)
    'too-many-c'
    >>> urlify('Too many characters, with word break at limit', max_length=9)
    'too-many'
    >>> urlify('Page name/3', keep_slashes=True)
    'page-name/3'
    >>> urlify(u'Unicode macht Spa\\xdf') == u'unicode-macht-spa\\xdf'
    True
    >>> urlify(u'Uppercase Unicode \\xc4nderung') == u'uppercase-unicode-\\xe4nderung'
    True
    >>> urlify('')
    ''
    >>> urlify('\\n')
    ''
    >>> urlify('-')
    ''
    """
    # TODO: max_length: it would be better to stop at a word break, rather
    # than immediately
    # Unicode policy: should we be allowing unicode in the URL? This can be
    # problematic with browser support.
    if keep_slashes:
        banned_chars = _BANNED_CHARS_KEEP_SLASHES_RE
    else:
        banned_chars = _BANNED_CHARS_RE

    slug = string.lower()                       # convert to lowercase
    slug = banned_chars.sub('', slug)           # remove banned characters
    slug = _BANNED_WORDS_RE.sub(' ', slug)      # remove banned words
    slug = _SLUG_SEPARATORS_RE.sub('-', slug)   # spaces/hyphens to one hyphen
    slug = slug[0:max_length]                   # trim to max_length chars
    slug = slug.strip()                         # remove leading and trailing space
    slug = slug.strip("-")                      # remove leading and trailing hyphens
    return slug


def isRedirect(text):
    """
    Determines if this text is a wiki redirection or not.

    text:    the complete wiki text of a page.
    returns: ('internal', name) for "#REDIRECT [[name]]",
             ('external', link) for "#REDIRECT [link]",
             None if the text is not a redirect.

    Whitespace around the link (e.g. a trailing newline) is ignored.

    Improvements:
        * Not yet sure if the links should be validated before being returned.
        * 'internal' and 'external' might be better implemented as bool or
          int, to speed up comparisons.

    >>> isRedirect('#REDIRECT [[test]]')
    ('internal', 'test')
    >>> isRedirect('#REDIRECT [test]')
    ('external', 'test')
    >>> isRedirect('#REDIRECT [[test]]\\n')
    ('internal', 'test')
    >>> isRedirect('not a redirect')
    """
    # This redirection test should be contained in the wiki engine, as it
    # relates to wiki syntax
    redirect_string = "#REDIRECT "
    if not text.startswith(redirect_string):
        return None

    target = text[len(redirect_string):].strip()
    # handle an internal link: [[blog]]
    if target[:2] == '[[' and target[-2:] == ']]':
        return ('internal', target[2:-2])
    # handle an external link: [http://testset.com]
    if target[:1] == '[' and target[-1:] == ']':
        # ?Validate link before redirect?
        return ('external', target[1:-1])
    return None


# HELPER FUNCTIONS
################################################################################

def countWords(string, min_word_length=4):
    """
    Takes a HTML string and counts the number of words.

    HTML tags are ignored, but there still needs to be a space between two
    words. Words shorter than min_word_length are not counted.

    IMPROVEMENTS: Use the same banned word list used by Django and in urlify()

    >>> countWords('Here is a longish string of simple words.', 4)
    5
    >>> countWords('Here is a <em>longish</em> string of simple words.', 4)
    5
    """
    return sum([len(word) >= min_word_length
                for word in getWords(string).split()])


def getWords(string):
    """
    Gets the text of a string without its HTML tags.

    Everything from a "<" up to and including the next ">" is dropped; an
    unclosed "<" drops the rest of the string.

    >>> getWords('Here is <b>my</b> String')
    'Here is my String'
    """
    kept = []
    in_tag = False
    for char in string:
        if char == "<":
            in_tag = True
        elif in_tag and char == ">":
            in_tag = False
        elif not in_tag:
            kept.append(char)
    return "".join(kept)


def splitWords(string, split, maxsplit=None):
    """
    Like split(), but ignores any separators inside HTML tags.

    string:   the text to split.
    split:    the single separator character.
    maxsplit: maximum number of splits, or None for no limit.
    returns:  list of the pieces (tags are kept in the pieces).

    >>> splitWords('Here is my String', ":")
    ['Here is my String']
    >>> splitWords('Here is my String: with colon', ":")
    ['Here is my String', ' with colon']
    >>> splitWords('a:b:c', ":", 1)
    ['a', 'b:c']
    >>> splitWords("", ":")
    ['']
    """
    result_list = []
    current_word = ""
    in_tag = False
    for char in string:
        if char == "<":
            in_tag = True
        elif in_tag and char == ">":
            in_tag = False

        if not in_tag and char == split and maxsplit != 0:
            result_list.append(current_word)
            current_word = ""
            if maxsplit is not None:
                maxsplit -= 1
        else:
            current_word += char
    result_list.append(current_word)
    return result_list


def _test():
    """Run the doctests of this module."""
    import doctest
    doctest.testmod()


if __name__ == "__main__":
    _test()