Clean-up the imported text based on the Slovak typographic rules

A common problem when receiving text from other people is that it is often not as consistently formatted as you might like. Fixing such problems manually can become very time-consuming; this script will help you clean up the imported text based on the Slovak typographic rules. Some common things that need to be fixed are:
 * removing double spaces
 * removing spaces before full stops, commas, colons, semicolons, question marks or exclamation marks
 * removing extra spaces or tabs at the beginning and at the end of the paragraphs
 * removing blank lines between paragraphs

The script

#!/usr/bin/env python
# -*- coding: utf-8 -*-

""" Author: Richard Sitányi (cdbox@zilina.net) File: clean-up.py (Clean-up the imported text based on the Slovak typographic rules.) Version: 1.0 Date: 06/02/2013

LICENSE: This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place – Suite 330, Boston, MA 02111-1307, USA.

DESCRIPTION: A common problem when receiving text from other people is that it is often not as consistently formatted as you might like. Fixing such problems manually can become very time-consuming; this script will help you clean up the imported text based on the Slovak typographic rules. Some common things that need to be fixed are:
 * removing double spaces
 * removing spaces before full stops, commas, colons, semicolons, question marks or exclamation marks
 * removing extra spaces or tabs at the beginning and at the end of the paragraphs
 * removing blank lines between paragraphs

USAGE: Create new document, insert text frame, import text from file, run the script and enjoy ;-)

"""

import sys
import re

# The scribus module only exists inside the Scribus scripting host, so a
# failed import means the script was started from a plain Python interpreter.
# The except clause and print call use syntax that both Python 2 and Python 3
# accept, so the script at least parses under either interpreter.
try:
    import scribus
except ImportError:
    print('This Python script is written for the Scribus scripting interface. It can only be run from within Scribus.')
    sys.exit(1)

# Preconditions: an open document with exactly one selected object.
# haveDoc and selectionCount are functions and must be *called*; testing the
# bare function objects is always truthy, so the warnings could never appear.
if not scribus.haveDoc():
    scribus.messageBox('Warning', 'You should open a document.', scribus.ICON_WARNING, scribus.BUTTON_OK)
    sys.exit(1)

if scribus.selectionCount() == 0:
    scribus.messageBox('Warning', 'You should select a text frame.', scribus.ICON_WARNING, scribus.BUTTON_OK)
    sys.exit(1)

if scribus.selectionCount() > 1:
    scribus.messageBox('Warning', 'You should select one text frame.', scribus.ICON_WARNING, scribus.BUTTON_OK)
    sys.exit(1)

# Code points used by the rules, named once so the patterns stay readable.
_SPACE = u'\u0020'
_EN_DASH = u'\u2013'
_ELLIPSIS = u'\u2026'
_LEFT_SINGLE_QUOTE = u'\u201a'    # Slovak left single quotation mark
_LEFT_DOUBLE_QUOTE = u'\u201e'    # Slovak left double quotation mark
_RIGHT_SINGLE_QUOTE = u'\u2018'   # Slovak right single quotation mark
_RIGHT_DOUBLE_QUOTE = u'\u201c'   # Slovak right double quotation mark
_LINE_BREAK = u'\u001c'           # new line (SHIFT+ENTER)
_PARAGRAPH_BREAK = u'\u000d'      # carriage return
_BREAKS = _LINE_BREAK + _PARAGRAPH_BREAK


def _unless_after(chars):
    """Return a negative lookbehind rejecting any single character in *chars*."""
    return u'(?<![' + chars + u'])'


def _unless_before(chars, or_end_of_text=False):
    """Return a negative lookahead rejecting any single character in *chars*.

    *chars* is spliced into a regex character class, so it may contain a range
    such as ``0-9``.  With *or_end_of_text* the lookahead also rejects the end
    of the text, so no trailing space is appended after the last character.
    """
    alternatives = u'[' + chars + u']'
    if or_end_of_text:
        alternatives += u'|$'
    return u'(?!' + alternatives + u')'


# An opening parenthesis or quotation mark gets a leading space unless it
# already follows a space or another opener; the latter avoids re-inserting
# the space that the "opener+space to opener" rules have just removed.
_BEFORE_OPENER = _unless_after(
    _SPACE + u'(' + _LEFT_SINGLE_QUOTE + _LEFT_DOUBLE_QUOTE)

# Characters that may directly follow a percent sign, a right parenthesis or a
# closing quotation mark.  The right parenthesis is included so the spacing
# rules never produce "x )" after the "space+right parenthesis" rule has run.
_PUNCTUATION = u'.,;:?!)'
_AFTER_CLOSER = (_SPACE + _PUNCTUATION + _RIGHT_SINGLE_QUOTE
                 + _RIGHT_DOUBLE_QUOTE + _BREAKS)

# Characters that may directly follow an ellipsis, question mark or
# exclamation mark.
_AFTER_SENTENCE_END = (_SPACE + u'?!)' + _RIGHT_SINGLE_QUOTE
                       + _RIGHT_DOUBLE_QUOTE + _BREAKS)

# Ordered (pattern, replacement) pairs.  The rules are applied one after the
# other to the whole text, so later rules see the output of earlier ones and
# the order is significant.
replacements = (
    # tab and non breaking space to single space
    (u'[\u0009\u00a0]+', _SPACE),

    # space+hyphen+space to space+en dash+space
    (_SPACE + u'-+' + _SPACE, _SPACE + _EN_DASH + _SPACE),

    # double space to single space
    (_SPACE + _SPACE + u'+', _SPACE),

    # space+solidus or solidus+space to solidus
    (_SPACE + u'/|/' + _SPACE, u'/'),

    # space+hyphen or hyphen+space to hyphen
    (_SPACE + u'-|-' + _SPACE, u'-'),

    # ampersand to space+ampersand; if there is not a space before the ampersand
    (_unless_after(_SPACE) + u'&', _SPACE + u'&'),

    # ampersand to ampersand+space; if there is not a space after the ampersand
    (u'&' + _unless_before(_SPACE), u'&' + _SPACE),

    # percent sign to space+percent sign; if there is not a space before it.
    # WARNING: In Slovak the percent sign is spaced if the number is used as a
    # noun, while no space is inserted if the number is used as an adjective
    # (e.g. "a 50% increase").  Therefore consider carefully the use of this
    # replacement; it is disabled by default.
    # (_unless_after(_SPACE) + u'%', _SPACE + u'%'),

    # percent sign to percent sign+space; unless followed by a space,
    # punctuation, a right quotation mark, a new line or a carriage return
    (u'%' + _unless_before(_AFTER_CLOSER), u'%' + _SPACE),

    # left parenthesis to space+left parenthesis; unless preceded by a space
    # or another opening mark
    (_BEFORE_OPENER + r'\(', _SPACE + u'('),

    # right parenthesis to right parenthesis+space; unless followed by a
    # space, punctuation, a right quotation mark, a new line or a carriage
    # return
    (r'\)' + _unless_before(_AFTER_CLOSER), u')' + _SPACE),

    # left parenthesis+space to left parenthesis
    (r'\(' + _SPACE, u'('),

    # space+right parenthesis to right parenthesis
    (_SPACE + r'\)', u')'),

    # three full stops to ellipsis
    (r'\.\.\.+', _ELLIPSIS),

    # space+ellipsis to ellipsis
    (_SPACE + _ELLIPSIS, _ELLIPSIS),

    # double ellipsis to ellipsis
    (_ELLIPSIS + _ELLIPSIS + u'+', _ELLIPSIS),

    # ellipsis to ellipsis+space; unless followed by a space, question mark,
    # exclamation mark, right parenthesis, right quotation mark, new line,
    # carriage return or end of text
    (_ELLIPSIS + _unless_before(_AFTER_SENTENCE_END, or_end_of_text=True),
     _ELLIPSIS + _SPACE),

    # space+full stop to full stop
    (_SPACE + r'\.', u'.'),

    # double full stop to full stop
    (r'\.\.+', u'.'),

    # full stop to full stop+space; unless followed by a space, comma,
    # semicolon, colon, right parenthesis, right quotation mark, new line,
    # carriage return or end of text
    (r'\.' + _unless_before(_SPACE + u',;:)' + _RIGHT_SINGLE_QUOTE
                            + _RIGHT_DOUBLE_QUOTE + _BREAKS,
                            or_end_of_text=True),
     u'.' + _SPACE),

    # space+comma to comma
    (_SPACE + u',', u','),

    # double comma to comma
    (u',,+', u','),

    # comma to comma+space; unless followed by a space, digit, right quotation
    # mark, new line or carriage return
    (u',' + _unless_before(_SPACE + u'0-9' + _RIGHT_SINGLE_QUOTE
                           + _RIGHT_DOUBLE_QUOTE + _BREAKS),
     u',' + _SPACE),

    # space+colon to colon
    (_SPACE + u':', u':'),

    # double colon to colon
    (u'::+', u':'),

    # colon to colon+space; unless followed by a space, new line or carriage
    # return
    (u':' + _unless_before(_SPACE + _BREAKS), u':' + _SPACE),

    # space+semicolon to semicolon
    (_SPACE + u';', u';'),

    # double semicolon to semicolon
    (u';;+', u';'),

    # semicolon to semicolon+space; unless followed by a space, new line or
    # carriage return
    (u';' + _unless_before(_SPACE + _BREAKS), u';' + _SPACE),

    # space+question mark to question mark
    (_SPACE + r'\?', u'?'),

    # question mark to question mark+space; unless followed by a space,
    # question mark, exclamation mark, right parenthesis, right quotation
    # mark, new line, carriage return or end of text
    (r'\?' + _unless_before(_AFTER_SENTENCE_END, or_end_of_text=True),
     u'?' + _SPACE),

    # space+exclamation mark to exclamation mark
    (_SPACE + u'!', u'!'),

    # exclamation mark to exclamation mark+space; unless followed by a space,
    # question mark, exclamation mark, right parenthesis, right quotation
    # mark, new line, carriage return or end of text
    (u'!' + _unless_before(_AFTER_SENTENCE_END, or_end_of_text=True),
     u'!' + _SPACE),

    # slovak left single quotation mark+space to slovak left single quotation mark
    (_LEFT_SINGLE_QUOTE + _SPACE, _LEFT_SINGLE_QUOTE),

    # slovak left double quotation mark+space to slovak left double quotation mark
    (_LEFT_DOUBLE_QUOTE + _SPACE, _LEFT_DOUBLE_QUOTE),

    # space+slovak right single quotation mark to slovak right single quotation mark
    (_SPACE + _RIGHT_SINGLE_QUOTE, _RIGHT_SINGLE_QUOTE),

    # space+slovak right double quotation mark to slovak right double quotation mark
    (_SPACE + _RIGHT_DOUBLE_QUOTE, _RIGHT_DOUBLE_QUOTE),

    # slovak left single quotation mark to space+slovak left single quotation
    # mark; unless preceded by a space or another opening mark
    (_BEFORE_OPENER + _LEFT_SINGLE_QUOTE, _SPACE + _LEFT_SINGLE_QUOTE),

    # slovak left double quotation mark to space+slovak left double quotation
    # mark; unless preceded by a space or another opening mark
    (_BEFORE_OPENER + _LEFT_DOUBLE_QUOTE, _SPACE + _LEFT_DOUBLE_QUOTE),

    # slovak right single quotation mark to mark+space; unless followed by a
    # space, punctuation, slovak right double quotation mark, new line or
    # carriage return
    (_RIGHT_SINGLE_QUOTE
     + _unless_before(_SPACE + _PUNCTUATION + _RIGHT_DOUBLE_QUOTE + _BREAKS),
     _RIGHT_SINGLE_QUOTE + _SPACE),

    # slovak right double quotation mark to mark+space; unless followed by a
    # space, punctuation, new line, carriage return or end of text
    (_RIGHT_DOUBLE_QUOTE
     + _unless_before(_SPACE + _PUNCTUATION + _BREAKS, or_end_of_text=True),
     _RIGHT_DOUBLE_QUOTE + _SPACE),

    # new line (SHIFT+ENTER) to carriage return
    (_LINE_BREAK + u'+', _PARAGRAPH_BREAK),

    # space+carriage return to carriage return
    (_SPACE + _PARAGRAPH_BREAK + u'+', _PARAGRAPH_BREAK),

    # carriage return+space to carriage return
    (_PARAGRAPH_BREAK + _SPACE + u'+', _PARAGRAPH_BREAK),

    # double carriage return to carriage return
    (_PARAGRAPH_BREAK + _PARAGRAPH_BREAK + u'+', _PARAGRAPH_BREAK),

    # remove extra spaces from start and end of a text
    (u'^' + _SPACE + u'|' + _SPACE + u'$', u''),
)

# Python 2 has a separate unicode type; Python 3 only has str.
try:
    _text_type = unicode
except NameError:
    _text_type = str

# getSelectedObject is a function; it has to be called to obtain the frame.
frame = scribus.getSelectedObject()

if scribus.getObjectType(frame) != 'TextFrame':
    scribus.messageBox('Warning', 'You should select a text frame.', scribus.ICON_WARNING, scribus.BUTTON_OK)
    sys.exit(1)

# Apply the rules one at a time.  The frame text is re-read for every rule
# because the previous rule may have changed it and shifted all offsets.
for pattern, replacement in replacements:
    content = _text_type(scribus.getAllText(frame))
    matches = list(re.finditer(pattern, content))
    # Replace from the last match backwards so that the offsets of the
    # matches still to be processed stay valid.
    for match in reversed(matches):
        start = match.start()
        scribus.selectText(start, match.end() - start, frame)
        scribus.deleteText(frame)
        scribus.insertText(replacement, start, frame)

scribus.setRedraw(True)
scribus.docChanged(True)
scribus.messageBox('Info', 'Script finished successfully.', scribus.ICON_INFORMATION, scribus.BUTTON_OK)