#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Title: Basic Wiki engine
Author: Will Hardy
Project: Content manager
Date: September 2007
Test Suite: python ./wiki_engine.py
$Revision: 206 $
Copyright: Will Hardy 2007, released under GPL version 3 (see licence.txt)
Description: A simple 'wiki' engine, providing useful tools for converting
plain text into HTML. The wiki engine can be used as a filter
in Django.
dewiki:
parses input and creates HTML links, quotes, dashes,
email addresses and image filenames.
paragraphs:
parses input and creates HTML paragraphs, headings,
lists and definitions.
number_format:
for floats that need special formatting.
urlify:
converts a string into an equivalent URL friendly version.
isRedirect:
given wiki text, it will determine if it is a redirect page.
improvements:
* An intermediate format (i.e. not HTML) breaks our simplicity,
but is far more flexible and good for the long term. This
would allow such wonderful things as LaTeX output of wiki
text and so on.
"""
from django import template
from django.template.defaultfilters import stringfilter
import locale
# Names exported by ``from <this module> import *``.
__all__ = ('dewiki', 'paragraphs', 'number_format', 'urlify', 'isRedirect')
# Django template library object; the filters below attach themselves to it
# through the ``@register.filter`` decorator.
register = template.Library()
#@stringfilter # Breaks doctest!!
@register.filter
def dewiki(value):
    """ Converts wiki style link markup to HTML.

    Processing order, as implemented below: typographic quotes, dashes and
    ellipsis; internal ``[[page#ref|text]]`` links; external ``[url text]``
    links; e-mail address obfuscation; image filenames. The transformed
    string is returned.

    NOTE(review): the expected outputs in the examples below, and several
    string literals and regular expressions in the body, contain no HTML
    markup or group names although the code clearly relies on them. They
    appear to have been stripped from this copy of the file -- confirm
    against the upstream source before relying on any of them.

    The following examples form the unit test cases:
    >>> dewiki('[site.com Site Name]')
    'Site Name'
    >>> dewiki('[http://site.com Site Name]')
    'Site Name'
    >>> dewiki('[https://site.com Site Name]')
    'Site Name'
    >>> dewiki('[site.com]')
    'site.com'
    >>> dewiki('[http://site.com]')
    'http://site.com'
    >>> dewiki('[https://site.com]')
    'https://site.com'
    >>> dewiki('[[localpage|Local Page]]')
    'Local Page'
    >>> dewiki('[[localpage/7|Local Page]]')
    'Local Page'
    >>> dewiki('[[local page|Local Page]]')
    'Local Page'
    >>> dewiki('[google.com#reference Google Page]')
    'Google Page'
    >>> dewiki('[http://google.com#reference Google Page]')
    'Google Page'
    >>> dewiki('[http://localhost:8000 localhost]')
    'localhost'
    >>> dewiki('[[localpage#reference|Local Page]]')
    'Local Page'
    >>> dewiki('[[#reference|On this page]]')
    'On this page'
    >>> dewiki('Simple "quote" usage.')
    'Simple “quote” usage.'
    >>> dewiki('"At the beginning and end"')
    '“At the beginning and end”'
    >>> dewiki('First line\\n"At the beginning and end of a line"\\n')
    'First line\\n“At the beginning and end of a line”\\n'
    >>> dewiki('also "at the end of a sentence".')
    'also “at the end of a sentence”.'
    >>> dewiki('more than "one" set of "quotes".')
    'more than “one” set of “quotes”.'
    >>> dewiki("explicit double`` ''quotes")
    'explicit double“ ”quotes'
    >>> dewiki("explicit single` quotes")
    'explicit single‘ quotes'
    >>> dewiki('text - and some more text.')
    'text — and some more text.'
    >>> dewiki('see pages 67-8.')
    'see pages 67–8.'
    >>> dewiki('explicit em-dash ---')
    'explicit em-dash —'
    >>> dewiki('explicit en-dash --')
    'explicit en-dash –'
    >>> dewiki(u'Unicode macht Spa\\xdf')
    u'Unicode macht Spa\\xdf'
    >>> dewiki(u'[www.fu-berlin.de Freie Universit\\xe4t]')
    u'Freie Universit\\xe4t'
    >>> dewiki('The first link [[home]] and some more text')
    'The first link home and some more text'
    >>> dewiki('The first link [[home]] and another link [[blog]]')
    'The first link home and another link blog'
    >>> dewiki('''Do not replace "quotes" in HTML tags.''')
    'Do not replace “quotes” in HTML tags.'
    >>> dewiki('[[|home]]')
    'home'
    >>> dewiki('Here is an image_123-b.jpg, which should be shown')
    'Here is an
    , which should be shown'
    >>> dewiki('Here is an image.png (with caption), which should be shown')
    'Here is an
    , which should be shown'
    # 'Here is an

    with "quotes"') # FUTURE 'Text
    with "quotes"' # FUTURE >>> dewiki('Text
    with "quotes"')
    # FUTURE 'Text with "quotes"'
    """
    import re
    # Every pattern in this function is compiled with the UNICODE flag.
    re_flags = re.UNICODE
    # QUOTES: single (‘ ’) double (“ ”)
    ############################################################################
    # In the future, it may be good to not do this inside tags, within tags
    # or within tags. That isn't an issue at the moment.
    # Within tags should also be allowed
    # 1. explicit left and right double quotes (``, '')
    # I'm toying with the idea of disallowing automatic quote substitutions if
    # the text has explicit quotes. This means an all or nothing option for
    # explicit quoting.
    value = value.replace("``", "“")
    value = value.replace("''", "”")
    # 2. Double quotes before a word (not following an equals sign: e.g. href="...)
    left_double_quotes = re.compile(r'([^\w=]|^)"(\w)', re_flags)
    value = left_double_quotes.sub(r'\1“\2', value)
    # 3. Double quotes after a word (not preceding a closing >, for tags)
    right_double_quotes = re.compile(r'(\w)"([^\w>]|$)', re_flags)
    value = right_double_quotes.sub(r'\1”\2', value)
    # 4. Single quotes before a word and explicit left single quotes (`)
    # The pattern requires a preceding non-word character and has no "|^"
    # alternative, so an apostrophe at the very start of the text is not
    # matched here and is turned into a right quote by step 5.
    left_single_quotes = re.compile(r"([^\w])'", re_flags)
    value = left_single_quotes.sub(r'\1‘', value)
    value = value.replace("`", "‘")
    # 5. Simply make the rest right single quotes
    value = value.replace("'", "’")
    # 6. m-dash is some text - followed by a dash - and then some more text
    # It can be explicitly called using three dashes: "---"
    m_dash = re.compile(r"(\w\s+)-(\s+\w)", re_flags)
    value = m_dash.sub(r'\1—\2', value)
    # "---" is replaced here, before "--" is handled in step 7, so a triple
    # dash is never consumed as an en-dash.
    value = value.replace("---", "—")
    # 7. n-dash is a dash between numbers on pages 34-5. But this is not minus.
    # It can be explicitly called using two dashes: "--"
    # This may be confused with a figure dash (used in phone numbers).
    # This is not an issue for me.
    n_dash = re.compile(r"(\d)-(\d)", re_flags)
    value = n_dash.sub(r'\1–\2', value)
    value = value.replace("--", "–")
    # 8. Ellipsis
    value = value.replace("...", "…")
    # INTERNAL LINKS
    # e.g. [[page#ref|text]] -> text
    ############################################################################
    # NOTE(review): the pattern below contains "(?P[" with no group name, yet
    # the loop reads the groups "page", "ref" and "text". The "<name>" parts
    # appear to be missing from this copy -- confirm against upstream.
    internal_links_re = re.compile(r'\[\[(?P[\w/ _]+)?(?P#[\w _]+)?(\|(?P[\w ]+))?\]\]', re_flags)
    match = internal_links_re.search(value)
    # Each pass rewrites the first remaining match and then searches the
    # updated string again from the beginning.
    while match:
        # If there is a page reference given, convert it to a url
        if match.group("page"):
            url = '/%s/%s' % (urlify(match.group("page"), keep_slashes=True), match.group("ref") or "")
        else:
            url = match.group("ref") or "/"
        text = match.group("text") or match.group("page")
        # The full substitute link
        # NOTE(review): one "%s" placeholder but two arguments (url, text);
        # presumably the link markup was lost from this literal -- confirm.
        substitution = '%s' % (url, text)
        # Substitute into our value
        value = value[:match.start(0)] + substitution + value[match.end(0):]
        # Look for another match
        match = internal_links_re.search(value)
    # EXTERNAL LINKS
    # e.g. [prefix://url#ref text] -> text
    ############################################################################
    # NOTE(review): same "(?P[" problem as above; the loop reads the groups
    # "prefix", "url" and "name".
    external_links_re = re.compile(r'\[(?P(http|https|ftp)://)?(?P[\w\d.:/#%_-]+)( (?P[^\]]+))?\]', re_flags)
    match = external_links_re.search(value)
    while match:
        # Work out the url with a prefix
        prefix = match.group("prefix") or "http://"
        url = prefix + match.group("url")
        # The given name or the given url
        name = match.group("name") or (match.group("prefix") or "") + match.group("url")
        # NOTE(review): one placeholder, two arguments -- see the internal
        # link substitution above.
        substitution = '%s' % (url, name)
        # Substitute into our value
        value = value[:match.start(0)] + substitution + value[match.end(0):]
        # Look for another match
        match = external_links_re.search(value)
    # EMAIL ADDRESSES
    # e.g. anything@email.com
    ############################################################################
    # Obfuscates email addresses, with progressive enhancement.
    # Uses CSS to remove additional characters, uses javascript to create mailto link
    # NOTE(review): the replacement below contains no markup for CSS or
    # javascript to hook into; presumably stripped from this copy -- confirm.
    email_obfuscate = r'''\1 REMOVE-THIS @ REMOVE-THIS \2'''
    # This is difficult, it should use escapes, to keep it valid xhtml.
    #email_obfuscate = r''''''
    # Any run of non-space, non-"@" characters on both sides of an "@" counts
    # as an address.
    email_addresses = re.compile(r'([^\s@]+)@([^\s@]+)', re_flags)
    value = email_addresses.sub(email_obfuscate, value)
    ## Phone numbers: +12 345 67 89 but not 1234
    ## This isn't for everybody
    #phone_numbers = re.compile(r'(\+?\d[\d ]+\d\d[\d ]+\d)', re_flags)
    #value = phone_numbers.sub(r'\1', value)
    # IMAGES
    ##########################################################################
    # It would be good to validate if the image is in the database, and use the description text there.
    IMAGE_DIRECTORY = '/media/images/' # Should get this from somewhere else
    # Alternates on every image found; the derived odd_even value is only
    # referenced by the commented-out caption variant further down.
    odd_item = True
    # NOTE(review): "(?P" without group names again; the loop reads the groups
    # "filename", "basename", "sep" and "caption".
    images = re.compile(r'(?P(?P[^\s]+)\.(jpg|png|JPG|PNG))(?P[^"])(\((?P[^)]+)\))?', re_flags)
    match = images.search(value)
    while match:
        # Work out our values
        image_filename = match.group("filename") or ""
        caption = match.group("caption") or ""
        # Alt text preference: caption, then base name, then full file name.
        image_alt = caption or match.group("basename") or image_filename
        sep = match.group("sep") or ""
        if odd_item:
            odd_even = "odd"
            odd_item = False
        else:
            odd_even = "even"
            odd_item = True
        # NOTE(review): the disabled caption variant below and the active
        # substitution after it are split across lines and have fewer
        # placeholders than arguments in this copy; they are reproduced
        # unchanged -- confirm the intended literals against upstream.
        #if caption:
        #substitution = '
- %s
' % (
        #odd_even, IMAGE_DIRECTORY, image_filename, image_alt, caption)
        #else:
        substitution = '
%s' % (IMAGE_DIRECTORY, image_filename, image_alt, sep)
        # Substitute into our value
        value = value[:match.start(0)] + substitution + value[match.end(0):]
        # Look for another match
        match = images.search(value)
    return value
#@stringfilter # Breaks doctest!
@register.filter
def paragraphs(input, heading_level=2):
    """ Converts wiki style link markup to HTML: paragraphs and headings.

    The text is processed line by line. Each stripped line is classified, in
    this order, as an unordered list item (starts with "*"), a definition
    (contains ":" after at most MAX_DEFINITION_WORDS counted words), a heading
    (fewer than MAX_HEADING_WORDS counted words, not ending in "." or ":",
    no leading space), a paragraph (any other non-empty line) or a blank
    line. ``heading_level`` is passed through ``int()`` for the heading output.

    NOTE(review): the expected outputs below and the format strings in the
    body carry no HTML markup in this copy of the file; it appears to have
    been stripped -- confirm against the upstream source.

    NOTE: This becomes reasonably complicated, when you really want to
    have smart processing, as well as explicit markup.
    Regular expressions may not suffice for such a task.
    The following examples form the unit test cases:
    >>> paragraphs('''Here is a paragraph.\\n\\nHere is another.''')
    u'Here is a paragraph.
    Here is another.
    '
    >>> paragraphs('Here is a heading\\nHere is the main text.')
    u'Here is a heading
    Here is the main text.
    '
    >>> paragraphs('Two lines separated by a newline.\\ncharacter.')
    u'Two lines separated by a newline.
    character.
    '
    >>> paragraphs('Here is not a heading because it is too long, it is probably a mistake, a line without an end\\n\\nHere is the main text.')
    u'Here is not a heading because it is too long, it is probably a mistake, a line without an end
    Here is the main text.
    '
    >>> paragraphs('Here is a heading\\nHere is the main text.\\n\\nHere is some more text.')
    u'Here is a heading
    Here is the main text.
    Here is some more text.
    '
    >>> paragraphs(' Here is not a heading\\nbecause it had a space at the beginning.')
    u'Here is not a heading
    because it had a space at the beginning.
    '
    # ALTERNATIVE VERSION >>> paragraphs('Here is a definition: and some more text.')
    # ALTERNATIVE VERSION u'Here is a definition: and some more text.
    '
    >>> paragraphs('Here is a definition: and some more text.')
    u'- Here is a definition:
    - and some more text.
    '
    # ALTERNATIVE VERSION >>> paragraphs('Here is a definition: and some more text.\\nHere is another: and here is the longer text of that definition.')
    # ALTERNATIVE VERSION u'Here is a definition: and some more text. Here is another: and here is the longer text of that definition.
    '
    >>> paragraphs('Here is a definition: and some more text.\\nHere is another: and here is the longer text of that definition.')
    u'- Here is a definition:
    - and some more text.
    - Here is another:
    - and here is the longer text of that definition.
    '
    >>> paragraphs('Here is not a definition because it is more like a paragraph with a colon: and more text.')
    u'Here is not a definition because it is more like a paragraph with a colon: and more text.
    '
    >>> paragraphs('Here is not a definition colon in URL.')
    u'Here is not a definition colon in URL.
    '
    >>> paragraphs('* Here is an unordered list\\n*With another line\\n* And another')
    u'- Here is an unordered list
    - With another line
    - And another
    '
    >>> paragraphs('\\n* Here is an unordered list\\n*With another line\\n* And another')
    u'- Here is an unordered list
    - With another line
    - And another
    '
    >>> paragraphs('Here is a list:\\n* Here is an unordered list\\n*With another line\\n* And another')
    u'- Here is a list:
    - Here is an unordered list
    - With another line
    - And another
    '
    >>> paragraphs('Melbourne University, Australia (current):\\n*Bachelor of Software Engineering')
    u'- Melbourne University, Australia (current):
    '
    """
    """ Break into lines and process line by line, keep track of some states.
Improvement: Don't just count characters, count number of (long) words
"""
    # Parameters
    #MAX_HEADING_LENGTH = 50
    #MAX_DEFINITION_LENGTH = 40
    MAX_HEADING_WORDS = 6
    MAX_DEFINITION_WORDS = 5
    # Only words of at least this many characters are counted towards the
    # heading limit.
    MIN_WORD_LENGTH = 4
    # States
    # ORDERED_LIST is declared but never assigned to current_state in this function.
    DEFINITION, UNORDERED_LIST, ORDERED_LIST = 1, 2, 3
    current_state = None
    result = ""
    closing_tag = "" # To allow multiple lines to be grouped together
    for line in input.splitlines():
        # The unstripped line is kept so the heading test can see leading spaces.
        original_line = line
        line = line.strip()
        words = getWords(line) # Words without HTML tags
        number_words = sum([ len(word) >= MIN_WORD_LENGTH for word in words.split() ])
        # Unordered list (first character is a star)
        if line[:1] == "*":
            if current_state != UNORDERED_LIST:
                # NOTE(review): the opening literal below is empty and the
                # closing one is split across two lines in this copy;
                # presumably list markup was lost -- confirm against upstream.
                result += u''
                closing_tag = u'
' + closing_tag
                current_state = UNORDERED_LIST
            result += u'%s ' % "".join(line[1:].strip())
        # A definition (short line with colon (colon not at end))
        elif ":" in words and 0 < countWords(words.split(":",1)[0]) <= MAX_DEFINITION_WORDS:
            if current_state != DEFINITION:
                # Close whatever group was open before starting a new one.
                result += closing_tag + u''
                closing_tag = u'
'
                current_state = DEFINITION
            else:
                result += ''
            # Split on the first colon that is outside an HTML tag.
            definition = splitWords(line, ":", 1) # Ignore HTML tags
            result += u'%s: %s' % (definition[0], "".join(definition[1:]).strip())
        # A heading
        elif 0 < number_words < MAX_HEADING_WORDS and "".join(line[-1:]) not in '.:' and "".join(original_line[:1]) != " ":
            # NOTE(review): two placeholders but four arguments; presumably the
            # heading markup using heading_level was lost -- confirm.
            result += u'%s%s ' % (closing_tag, int(heading_level), line, int(heading_level))
            closing_tag = ""
            current_state = None
        # Otherwise, just do a paragraph
        elif len(line) > 0:
            result += u'%s%s
' % (closing_tag, line)
            closing_tag = ""
            current_state = None
        else:
            # Blank line: leaves result and closing_tag untouched, only resets the state.
            current_state = None
    # Emit whatever closing text is still pending for the last open group.
    return result + closing_tag
@register.filter
def number_format(string, digits=12, decimal=2):
    """Format a number according to the current user locale.

    The value is converted with ``float()`` and rendered with the fixed
    pattern ``"%8.2f"`` (minimum width 8, two decimals) with locale digit
    grouping enabled.

    Args:
        string: Anything ``float()`` accepts (number or numeric string).
        digits: Accepted for backward compatibility; currently ignored
            because the format pattern is fixed.
        decimal: Accepted for backward compatibility; currently ignored
            for the same reason.

    Returns:
        The formatted string, or ``""`` when the value cannot be converted
        to a float or the locale cannot be set.
    """
    #format_string = "%%%d.%df" % (digits, decimal)
    format_string = "%8.2f"
    try:
        # Side effect: switches the whole process to the user's default locale.
        locale.setlocale(locale.LC_ALL, "")
        # locale.format() was removed in Python 3.12; with the old blanket
        # "except Exception" that made this filter always return "".
        # locale.format_string() takes the same arguments.
        return locale.format_string(format_string, float(string), True)
    except (TypeError, ValueError, OverflowError, locale.Error):
        # Template filters should fail quietly on bad input.
        return ""
@register.filter
def urlify(string, max_length=50, keep_slashes=False):
    """ Build a lowercase, hyphen-separated slug from arbitrary text.
    Adapted from django javascript code in django.contrib.admin: media/js/urlify.js
    Steps: lowercase, drop every character that is not a hyphen, word
    character or whitespace (slashes survive when keep_slashes is true, so
    "Page name/3" -> "page-name/3"), drop stop words, collapse runs of
    hyphens/whitespace into one hyphen, cut to max_length characters and
    finally trim surrounding whitespace and hyphens.
    A stop word is only removed when it is delimited by whitespace (or the
    string edges) that an earlier match has not already consumed, so of two
    adjacent stop words only the first one goes (see the "in this" example).
    The following examples form the unit test cases:
    >>> urlify('ChangeCase')
    'changecase'
    >>> urlify('Remove Space')
    'remove-space'
    >>> urlify(' Leading Space')
    'leading-space'
    >>> urlify('Trailing Spaces ')
    'trailing-spaces'
    >>> urlify(' Much space ')
    'much-space'
    >>> urlify('Extra - hyphen')
    'extra-hyphen'
    >>> urlify('-extra-- hyphens -')
    'extra-hyphens'
    >>> urlify('Illegal, bad. Characters?')
    'illegal-bad-characters'
    >>> urlify('A banned word or two but not before legal ones')
    'banned-word-or-two-not-legal-ones'
    >>> urlify('Too many characters in this sentence, it has more than fifty.')
    'too-many-characters-this-sentence-it-has-more-fift'
    >>> urlify('Too many characters in this sentence', max_length=10)
    'too-many-c'
    >>> urlify('Too many characters, with word break at limit', max_length=9)
    'too-many'
    >>> urlify(u'Unicode macht Spa\\xdf')
    u'unicode-macht-spa\\xdf'
    >>> urlify(u'Uppercase Unicode \\xc4nderung')
    u'uppercase-unicode-\\xe4nderung'
    >>> urlify('')
    ''
    >>> urlify('\\n')
    ''
    >>> urlify('-')
    ''
    """
    # TODO: max_length: it would be better to stop at a word break, rather than immediately
    # Unicode policy: should we be allowing unicode in the URL? This can be problematic with browser support.
    import re

    flags = re.UNICODE

    # Short filler words that are dropped from the slug.
    stop_words = [
        "a", "an", "as", "at", "before", "but", "by", "for", "from",
        "is", "in", "into", "like", "of", "off", "on", "onto", "per",
        "since", "than", "the", "this", "that", "to", "up", "via",
        "with",
    ]
    stop_word_re = re.compile(r'(^|\s+)(%s)(\s+|$)' % '|'.join(stop_words), flags)

    # Characters outside this whitelist are deleted outright.
    if keep_slashes:
        illegal_chars = re.compile(r'[^-\w\s/]', flags)
    else:
        illegal_chars = re.compile(r'[^-\w\s]', flags)

    lowered = string.lower()
    cleaned = illegal_chars.sub('', lowered)              # remove banned characters
    without_stop_words = stop_word_re.sub(' ', cleaned)   # remove banned words
    hyphenated = re.sub(r'[-\s]+', '-', without_stop_words)

    # Truncate first, then trim, so a cut that lands on a separator does not
    # leave a dangling hyphen.
    return hyphenated[0:max_length].strip().strip("-")
def isRedirect(text):
    """ Report whether wiki text is a redirect and, if so, where it points.
    A redirect is text that begins with "#REDIRECT " followed by either an
    internal link ("[[name]]") or an external link ("[target]") that runs to
    the very end of the text.
    Returns ('internal', name) or ('external', link), or None when the text
    is not a redirect. The link itself is not validated.
    Improvements:
    * Not yet sure if the links should be validated before being returned.
    * 'internal' and 'external' might be better implemented as bool or int, to speed up comparisons.
    >>> isRedirect('#REDIRECT [[test]]')
    ('internal', 'test')
    >>> isRedirect('#REDIRECT [test]')
    ('external', 'test')
    >>> isRedirect('not a redirect')
    """
    # This redirection test should be contained in the wiki engine, as it relates to wiki syntax
    marker = "#REDIRECT "
    marker_length = len(marker)

    # Guard: anything not starting with the marker is ordinary text.
    if text[0:marker_length] != marker:
        return None

    target = text[marker_length:]

    # Internal link, e.g. [[blog]]. Checked first because it would also
    # satisfy the single-bracket test below.
    double_bracketed = target[:2] == '[[' and target[-2:] == ']]'
    if double_bracketed:
        return ('internal', target[2:-2])

    # External link, e.g. [http://testset.com]
    single_bracketed = target[:1] == '[' and target[-1:] == ']'
    if single_bracketed:
        # ?Validate link before redirect?
        return ('external', target[1:-1])

    return None
# HELPER FUNCTIONS
################################################################################
def countWords(string, min_word_length=4):
    """
    Count the "long" words in an HTML string.
    HTML tags are removed with getWords(), the remaining text is split on
    whitespace and every piece that is at least min_word_length characters
    long is counted. Punctuation attached to a word counts towards its
    length, and two words separated only by a tag count as one.
    IMPROVEMENTS: Use the same banned word list used by Django and in urlify()
    >>> countWords('Here is a longish string of simple words.', 4)
    5
    """
    visible_text = getWords(string)
    return sum(1 for piece in visible_text.split() if len(piece) >= min_word_length)
def getWords(string):
    """ Return the text of string with everything between "<" and ">" removed.
    A "<" always opens a tag; the tag ends at the next ">". A ">" seen
    outside a tag is ordinary text and is kept. An unclosed tag swallows the
    rest of the string.
    """
    kept = []
    inside_tag = False
    for ch in string:
        if ch == "<":
            inside_tag = True
            continue
        if inside_tag:
            # Skip tag content; a ">" ends the tag and is dropped as well.
            if ch == ">":
                inside_tag = False
            continue
        kept.append(ch)
    return "".join(kept)
def splitWords(string, split, maxsplit=None):
    """ Split string on the single character split, skipping HTML tags.
    Works like str.split(split), except that a separator found between "<"
    and ">" is kept as ordinary text. With maxsplit set, at most that many
    splits are made; None means no limit. The comparison is made character
    by character, so split must be one character long.
    >>> splitWords('Here is my String', ":")
    ['Here is my String']
    >>> splitWords('Here is my String: with colon', ":")
    ['Here is my String', ' with colon']
    >>> splitWords("", ":")
    ['']
    """
    pieces = []
    buffer = []
    inside_tag = False
    splits_left = maxsplit
    for ch in string:
        # Track the tag state first: the character that opens a tag already
        # counts as inside it, the one that closes it as outside.
        if ch == "<":
            inside_tag = True
        elif ch == ">" and inside_tag:
            inside_tag = False

        is_separator = (not inside_tag) and ch == split and splits_left != 0
        if not is_separator:
            buffer.append(ch)
            continue

        pieces.append("".join(buffer))
        buffer = []
        if splits_left is not None:
            splits_left -= 1
    # The trailing piece is always emitted, even when it is empty.
    pieces.append("".join(buffer))
    return pieces
def _test():
    """ Run every doctest found in this module's docstrings. """
    from doctest import testmod
    testmod()


# Executing the file directly runs the embedded test suite.
if __name__ == "__main__":
    _test()