Formatter written in Python

Style

During a conversation the other week I misunderstood what someone said about separating style and content. For a brief moment I thought what was being offered was a system in which not even the markup that later determines the style was mixed with the actual text content. It then transpired that this was not how they did things, but the idea was so attractive that I figured I would give it a whirl myself.

To demonstrate, this is how raw HTML looks:

<p>This is a paragraph with some <i>italic</i> and some <b>bold</b> text.</p>

I don't like that. But I'm not much for using WYSIWYG (What You See Is What You Get) editors either. When I say I want to separate style and content, what I actually mean is that I want to separate style from myself. I don't like graphics, design, colors and such jazz. I write text!

But I do format my text a little. I indent top-level headers with two tabs and ordinary headers with just one tab. And I write an underscore just before and after phrases I want to emphasize. Also, I divide text into paragraphs with an empty line between them.

Here's how this page looks before the conversion

Conversion

With this in mind, I decided to write a conversion program to turn my text-only minimalist formatting into HTML code. There is a Perl program called txt2html that does this already but doesn't use my formatting rules. Also, it's written in Perl... In my folly I thought I'd write my program in Emacs Lisp to practise that language. Thankfully I came to my senses and wrote it in Python instead.

import os
import string
import sys

# Enum emulations
class SegType:
    """Integer tags naming the kinds of segment the converter can emit."""
    Unknown = 0  # not classified yet
    H1 = 1       # top level header
    H2 = 2       # ordinary header
    P = 3        # paragraph
    CODE = 4     # verbatim code section

# Does it show that I'm a Java programmer by training?
class Segment():
    """A run of input text together with the kind of HTML element it becomes.

    ``type`` holds one of the ``SegType`` constants and ``content`` holds the
    raw, unescaped text collected so far.
    """

    def __init__(self):
        self.type = SegType.Unknown
        self.content = ""

    def add_text(self, content):
        """Append ``content`` to the text collected so far."""
        self.content += content

    def get_text(self):
        """Return the raw (unescaped) text collected so far."""
        return self.content

    def get_formatted_text(self):
        """Return the segment rendered as an HTML fragment.

        The characters ``&``, ``<`` and ``>`` are replaced by entities. Code
        segments are wrapped in ``<CODE><PRE>`` without further processing;
        every other segment is passed through ``inline_format`` and wrapped
        in the tag matching its type. A type that is not one of the known
        ``SegType`` values is rendered like ``SegType.Unknown``, in a
        ``<div>``.
        """
        # "&" has to be escaped first; otherwise the "&" of the entities
        # produced for "<" and ">" would be escaped a second time.
        escaped = self.content.replace("&", "&amp;")
        escaped = escaped.replace("<", "&lt;").replace(">", "&gt;")

        if self.type == SegType.CODE:
            return "<CODE><PRE>\n" + escaped + "\n</PRE></CODE>"

        body = inline_format(escaped)
        if self.type == SegType.H1:
            return "<H1>" + body + "</H1>"
        if self.type == SegType.H2:
            return "<H2>" + body + "</H2>"
        if self.type == SegType.P:
            return "<P>\n" + body + "\n</P>"
        # SegType.Unknown, and anything unrecognised, falls back to a <div>
        # so that this method always returns a string.
        return "<div>\n" + body + "\n</div>"

    def set_type(self, typ):
        """Set the segment kind to ``typ`` (a ``SegType`` constant)."""
        self.type = typ

    def get_type(self):
        """Return the segment kind (a ``SegType`` constant)."""
        return self.type

def inline_format(text):
    """Replace underscore emphasis markers in ``text`` with italics tags.

    Underscores are consumed in order: the first becomes ``<i>``, the second
    ``</i>``, the third ``<i>`` again, and so on. All other characters are
    copied unchanged. An unpaired final underscore leaves its ``<i>`` open.
    """
    # Splitting on "_" yields the text between the markers; the marker in
    # front of every chunk after the first alternates open, close, open, ...
    chunks = text.split("_")
    pieces = [chunks[0]]
    for position, chunk in enumerate(chunks[1:]):
        if position % 2 == 0:
            pieces.append("<i>")
        else:
            pieces.append("</i>")
        pieces.append(chunk)
    return "".join(pieces)


def count_initial_tabs(line):
    """Return the indentation level of ``line`` as a whole number of tabs.

    Every leading tab counts as one level and every four leading spaces
    count as one more; leftover spaces are discarded. Other leading
    whitespace characters are skipped without adding to the level. The last
    character of ``line`` is never examined, so a trailing newline does not
    take part in the count. An empty line has level 0.
    """
    tabs = 0
    spaces = 0
    # line[:-1] leaves out the final character, as described above
    for ch in line[:-1]:
        if ch not in string.whitespace:
            break
        if ch == ' ':
            spaces += 1
        elif ch == '\t':
            tabs += 1
    # Four spaces is equivalent to one tab, so I declare. Floor division
    # keeps the result an integer on Python 3 as well as Python 2, which the
    # callers' "== 1" / "== 2" comparisons rely on.
    return tabs + spaces // 4

# Script body: read the text file named on the command line, split it into
# segments and write them out as an HTML file next to the input.
if len(sys.argv) < 2:
    print("Usage: process.py <txtFile>")
    # A missing argument is an error, so report a non-zero status. sys.exit
    # is used because quit() is only installed by the site module.
    sys.exit(1)

target_name = sys.argv[1]
# Read the whole input up front; the with-block closes the file again.
with open(target_name) as src:
    lines = src.readlines()

# Number of empty lines seen since the last classified line. Starting at 1
# treats the beginning of the file like an empty line, so text on the very
# first line is classified instead of being added to a segment that is
# never emitted.
newlines = 1
para = False   # True while a paragraph segment is being collected
code = False   # True between an opening and a closing "//" line
segment = Segment()
segments = []

for line in lines:
    # Empty lines separate sections
    if line == "\n" or len(line) == 0:
        if para:
            para = False
            segments.append(segment)
        newlines += 1
    elif line.startswith("//"):  # Special code designator. Unfortunate choice really...
        if code:
            # Closing marker: the code segment is complete
            code = False
            segments.append(segment)
        else:
            # Opening marker: start collecting a code segment
            segment = Segment()
            segment.set_type(SegType.CODE)
            code = True
    else:  # Apparently this line contains something
        if newlines > 0:  # Have we been seeing some set of newlines?
            tab_level = count_initial_tabs(line)
            if code:
                segment.add_text(line)
            elif tab_level == 1 or tab_level == 2:
                # One tab marks an ordinary header, two tabs a top level
                # header. Headers are single lines, so they are stored at once.
                segment = Segment()
                if tab_level == 1:
                    segment.set_type(SegType.H2)
                else:
                    segment.set_type(SegType.H1)
                segment.add_text(line.strip())
                segments.append(segment)
            else:
                para = True   # Then this is a new paragraph we've encountered
                segment = Segment()
                segment.set_type(SegType.P)
                segment.add_text(line.strip())
            newlines = 0
        else:  # Previous line wasn't empty, so we're filling up the current segment
            # The first line of a paragraph or header was stored stripped,
            # without its newline; put the line break back so that this line
            # is not glued onto the last word of the previous one.
            collected = segment.get_text()
            if collected and not collected.endswith("\n"):
                segment.add_text("\n")
            segment.add_text(line)

# If the file ends without an empty line, this will be needed.
if code or para:
    segments.append(segment)

# Swap only the final extension for ".html"; directory parts and any other
# dots in the name (for instance "./notes.txt") are kept.
output_name = os.path.splitext(target_name)[0] + ".html"

with open(output_name, "w") as output:
    output.write("""<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>""")
    for seg in segments:
        output.write(seg.get_formatted_text() + "\n\n")
    # Close <body> as well as <html>
    output.write("</body></html>")