Project Dana

Component parsing.Markdown by barry
//One entry of the inline-style stack used by parseLine(): the style flags in effect, and where the text they apply to begins.
data StyleFrame {
	//bit flags for the inline text styles; they are combined with "|" into the flag field
	const byte BOLD = 0x1
	const byte ITALIC = 0x2
	const byte CODE = 0x4
	
	//the combination of the above flags that is active for this frame (0 for the base frame pushed at the start of a line)
	byte flag
	
	//index into the line being parsed at which the not-yet-emitted text of this frame starts
	int startPos
	}

//The parts of a [content](url "title") construct, as extracted by getLink().
data LinkSection {
	char linkContent[] //the text found in []
	char url[] //first item in (), up to the first space or the closing bracket
	char title[] //optional second item in (), after the space
	int postIndex //index in the scanned line just after the closing ")", i.e. where parsing resumes
	}

//One entry of the block-level stack used by processElements().
data StackEl {
	//the element that is currently open at this stack level
	MDElement mde
	//nested stack giving the content of a block quote its own fresh context; only set by the block quote branch of processElements()
	Stack subStack
	}

component provides Markdown requires data.StringUtil stringUtil, data.adt.List, data.adt.Stack, io.Output out, data.IntUtil iu {
	
	String[] toLines(char content[])
		{
		//Splits content into its lines, breaking at every "\n" (the "\r" of a "\r\n" pair is dropped with it).
		// - this function is similar to StringUtil.explode(), except that it preserves "blank lines" that appear between two or more line breaks
		// - a line break at the very end of content does not produce an extra empty line after it
		// - returns one String per line, in document order
		List lines = new List()
		
		//index of the first character of the line that is currently being scanned
		int lineStart = 0
		
		for (int i = 0; i < content.arrayLength; i++)
			{
			if (content[i] == "\n")
				{
				//exclusive end index of this line; step back over a "\r" that belongs to the line break
				int lineEnd = i
				
				if (lineEnd > lineStart && content[lineEnd-1] == "\r")
					lineEnd --
				
				addLineRange(lines, content, lineStart, lineEnd)
				
				//the next line begins after the line break, so the break itself never becomes part of a line
				lineStart = i + 1
				}
			}
		
		//grab the final chunk, if any (the text after the last line break, or the whole input if it has no line break)
		if (lineStart < content.arrayLength)
			addLineRange(lines, content, lineStart, content.arrayLength)
		
		return lines.getContents(typeof(String))
		}
	
	void addLineRange(List lines, char content[], int start, int end)
		{
		//Appends content[start] .. content[end-1] to lines as a new String; an empty range is stored as a String with no text.
		if (end <= start)
			{
			lines.add(new String())
			}
			else
			{
			int len = end - start
			char kn[] = new char[len]
			copyCharsFD(kn, 0, content, start, len)
			lines.add(new String(kn))
			}
		}
	
	void copyCharsFD(char dest[], int start, char source[], int srcStart, int srcLen)
		{
		//Copies srcLen characters, in ascending index order, from source (beginning at srcStart) into dest (beginning at start).
		int copied = 0
		
		while (copied < srcLen)
			{
			dest[start + copied] = source[srcStart + copied]
			copied ++
			}
		}
	
	MDElement processHeaderLine(char str[])
		{
		//Recognises a heading line, which starts with one to six "#" characters.
		// - returns a HEADING element whose second field is the heading level and whose content is the rest of the line, left-trimmed
		// - returns null if the line does not start with "#"
		// - the longest marker is tested first, because every shorter marker is also a prefix of the longer ones
		if (stringUtil.startsWith(str, "######"))
			return new MDElement(MDElement.HEADING, 6, headingText(str, 6))
		
		if (stringUtil.startsWith(str, "#####"))
			return new MDElement(MDElement.HEADING, 5, headingText(str, 5))
		
		if (stringUtil.startsWith(str, "####"))
			return new MDElement(MDElement.HEADING, 4, headingText(str, 4))
		
		if (stringUtil.startsWith(str, "###"))
			return new MDElement(MDElement.HEADING, 3, headingText(str, 3))
		
		if (stringUtil.startsWith(str, "##"))
			return new MDElement(MDElement.HEADING, 2, headingText(str, 2))
		
		if (stringUtil.startsWith(str, "#"))
			return new MDElement(MDElement.HEADING, 1, headingText(str, 1))
		
		return null
		}
	
	char[] headingText(char str[], int markerLen)
		{
		//Returns the part of str that follows its first markerLen characters, with leading whitespace removed.
		return stringUtil.subString(str, markerLen, str.arrayLength - markerLen).ltrim()
		}
	
	char[] extractArray(char src[], int start, int end)
		{
		//Returns a new array holding a copy of src[start] .. src[end-1] (start is inclusive, end is exclusive).
		// - an inverted range (end before start) yields null, because "end-start" is not a usable array length in that case
		if (end < start) return null
		
		char result[] = new char[end-start]
		
		//j is the write position in result, i the read position in src
		int j = 0
		for (int i = start; i < end; i ++)
			{
			result[j] = src[i]
			j ++
			}
		
		return result
		}
	
	LinkSection getLink(char line[], int startIndex)
		{
		//Attempts to read a link of the form [content](url) or [content](url "title"), where line[startIndex] is the opening "[".
		// - on success returns a LinkSection with the extracted parts; its postIndex is the index just after the closing ")"
		// - returns null if the text at startIndex is not a well-formed link, so that the caller can treat it as plain text
		//this is bracket-balance checks, plus in-string
		if (line[startIndex] == "[")
			{
			int sqBalance = 1
			int rbBalance = 0
			bool inString = false
			
			//0 = inside the [] section, 1 = reading the url, 2 = reading the title
			int mode = 0
			
			LinkSection ls = new LinkSection()
			
			//start index of the section that is currently being read
			int lastStart = startIndex + 1
			
			for (int i = startIndex + 1; i < line.arrayLength; i++)
				{
				if (!inString)
					{
					if (line[i] == "[")
						sqBalance ++
						else if (line[i] == "]")
						sqBalance --
						else if (line[i] == "(")
						rbBalance ++
						else if (line[i] == ")")
						rbBalance --
						else if (line[i] == "\"")
						inString = true
					
					//end of sq section
					if (mode == 0 && sqBalance == 0)
						{
						//the round bracket section must follow the closing "]" directly; otherwise this is just bracketed text
						// and a later, unrelated "(...)" on the same line must not be consumed as its url
						if (i + 1 >= line.arrayLength || line[i+1] != "(")
							return null
						
						ls.linkContent = extractArray(line, lastStart, i)
						mode = 1
						}
					
					//start of round bracket section
					if (mode == 1 && line[i] == "(" && rbBalance == 1)
						{
						lastStart = i + 1
						}
					
					//end of first param of the round bracket section
					if (mode == 1 && rbBalance == 1 && line[i] == " ")
						{
						ls.url = extractArray(line, lastStart, i)
						
						mode = 2
						
						lastStart = i + 1
						}
					
					//end of round bracket section, without the optional 2nd param
					if (mode == 1 && rbBalance == 0 && line[i] == ")")
						{
						ls.url = extractArray(line, lastStart, i)
						ls.postIndex = i + 1
						
						return ls
						}
					
					//end of round bracket section, with the optional 2nd param
					if (mode == 2 && rbBalance == 0 && line[i] == ")")
						{
						//the title section runs from lastStart up to (not including) the closing bracket at i
						int titleStart = lastStart
						int titleEnd = i
						
						//ignore spaces around the title
						while (titleStart < titleEnd && line[titleStart] == " ")
							{
							titleStart ++
							}
						
						while (titleEnd > titleStart && line[titleEnd-1] == " ")
							{
							titleEnd --
							}
						
						//strip the surrounding quotes, but only when both are really there
						if (titleEnd - titleStart >= 2 && line[titleStart] == "\"" && line[titleEnd-1] == "\"")
							{
							titleStart ++
							titleEnd --
							}
						
						//an empty title section leaves the title unset
						if (titleEnd > titleStart)
							ls.title = extractArray(line, titleStart, titleEnd)
						
						ls.postIndex = i + 1
						
						return ls
						}
					}
					else if (line[i] == "\"")
					{
					//closing quote: brackets are counted again from the next character on
					inString = false
					}
				}
			}
		
		return null
		}
	
	//the key here is that we need a flat list as output, no nesting -- and the text styles are just *flags*
	// - we need a stack to track the current set of text style flags, and whenever we get to a new style *start* marker we complete the current MDElement, with the flags it has, and push the new flags on to the stack
	// - at an end marker, we again end the current MDElement, pop the stack, and start the next MDElement with the style flags we used to have...
	//TODO:
	// - escape chars
	// - double back-ticks
	// - direct-url
	MDElement[] parseLine(char line[])
		{
		//Parses the inline markup of a single line (code spans, emphasis, links, images, escapes).
		// - returns a flat list of elements in text order: PLAIN elements carry the style flags that were active for their text,
		//   LINK elements carry their own parsed content as children, and IMAGE elements carry url and title
		// - text after the last marker, including the text of a style that is never closed, is returned as unstyled PLAIN text
		MDElement result[]
		
		Stack statusStack = new Stack()
		
		//the base frame: no style flags, text starting at index 0
		statusStack.push(new StyleFrame())
		
		StyleFrame top = statusStack.peek()
		
		for (int i = 0; i < line.arrayLength; i++)
			{
			if (line[i] == "`")
				{
				//code
				// - disables all other line style elements until code off
				
				//TODO: double-back-tick, which disables single-back-tick until code off...
				
				if ((top.flag & StyleFrame.CODE) == StyleFrame.CODE)
					{
					//end
					MDElement newEl = new MDElement(MDElement.PLAIN, top.flag, extractArray(line, top.startPos, i))
					result = new MDElement[](result, newEl)
					
					statusStack.pop()
					
					top = statusStack.peek()
					
					top.startPos = i + 1
					}
					else
					{
					//start
					MDElement newEl = new MDElement(MDElement.PLAIN, top.flag, extractArray(line, top.startPos, i))
					result = new MDElement[](result, newEl)
					
					statusStack.push(new StyleFrame(StyleFrame.CODE | top.flag, i + 1))
					
					top = statusStack.peek()
					}
				}
				else if ((top.flag & StyleFrame.CODE) != StyleFrame.CODE)
				{
				if (line[i] == "\\" && i+1 < line.arrayLength)
					{
					//escape character (ignore what's next, but remove the "\" itself)
					// - we end the current block here, then start a new one after the "\"
					MDElement newEl = new MDElement(MDElement.PLAIN, top.flag, extractArray(line, top.startPos, i))
					result = new MDElement[](result, newEl)
					
					// - the new block belongs to the same style frame, so we only move that frame's start position;
					//   pushing a frame here would leave an extra frame behind that no end marker ever removes
					top.startPos = i + 1
					
					//now we get the parser to ignore the escaped character by skipping over that character
					i ++
					}
					else if (line[i] == "*")
					{
					//emphasis -- check ahead to see how many *'s we have
					// (decide which flags this entails; peek at the stack-top and create a new flag set which combines the new and existing style flags)
					// - if it's the same flag set as the stack top, then it's just the end of that section...?
					
					byte flag = 0
					int postPos = 0
					int ori = i
					if (i+1 < line.arrayLength && line[i+1] == "*")
						{
						if (i+2 < line.arrayLength && line[i+2] == "*")
							{
							flag = StyleFrame.BOLD | StyleFrame.ITALIC
							postPos = 3
							i ++
							i ++
							}
							else
							{
							flag = StyleFrame.BOLD
							postPos = 2
							i ++
							}
						}
						else
						{
						flag = StyleFrame.ITALIC
						postPos = 1
						}
					
					top = statusStack.peek()
					
					if (top.flag == flag)
						{
						//end of current section
						MDElement newEl = new MDElement(MDElement.PLAIN, top.flag, extractArray(line, top.startPos, ori))
						result = new MDElement[](result, newEl)
						
						statusStack.pop()
						
						top = statusStack.peek()
						
						top.startPos = ori + postPos
						}
						else
						{
						//start of new section; we still end the current section before starting the new one
						MDElement newEl = new MDElement(MDElement.PLAIN, top.flag, extractArray(line, top.startPos, ori))
						result = new MDElement[](result, newEl)
						
						statusStack.push(new StyleFrame(flag | top.flag, ori + postPos))
						
						top = statusStack.peek()
						}
					}
					else if (line[i] == "[")
					{
					//start of a link
					// - this looks like [](), where the () can internally have two items: the link URL and the "title", separated by a space
					// - the thing in [] can be processed recursively, and the parse here also needs to be aware of nested []'s inside it, so we count...
					
					// - here we need to scan forward to consume the entire statement, and check it's syntactically correct
					// - if it isn't well-formed we can just ignore this syntax and continue
					
					LinkSection ls = getLink(line, i)
					
					if (ls != null)
						{
						//end of current section
						MDElement newEl = new MDElement(MDElement.PLAIN, top.flag, extractArray(line, top.startPos, i))
						result = new MDElement[](result, newEl)
						
						//adjust indices for where we resume, and add the link record
						// - the loop increments i before the next iteration, so i is placed on the closing ")" to make
						//   the character right after the link the next one that is examined
						top.startPos = ls.postIndex
						i = ls.postIndex - 1
						
						newEl = new MDElement(MDElement.LINK, top.flag, ls.url, ls.title, parseLine(ls.linkContent))
						result = new MDElement[](result, newEl)
						}
					}
					else if (line[i] == "!")
					{
					//start of an image
					// - ![]()
					
					if (i + 1 < line.arrayLength && line[i+1] == "[")
						{
						LinkSection ls = getLink(line, i+1)
						
						if (ls != null)
							{
							//end of current section
							MDElement newEl = new MDElement(MDElement.PLAIN, top.flag, extractArray(line, top.startPos, i))
							result = new MDElement[](result, newEl)
							
							//adjust indices for where we resume (see the link case above), and add the image record
							top.startPos = ls.postIndex
							i = ls.postIndex - 1
							
							newEl = new MDElement(MDElement.IMAGE, top.flag, ls.url, ls.title)
							result = new MDElement[](result, newEl)
							}
						}
					}
					//TODO: 
				}
			}
		
		top = statusStack.pop()
		
		//this is either plain text, or an unclosed style, which we also consider to be plain text...
		if (top != null && top.startPos < line.arrayLength)
			{
			MDElement newEl = new MDElement(MDElement.PLAIN, 0, extractArray(line, top.startPos, line.arrayLength))
			result = new MDElement[](result, newEl)
			}
		
		return result
		}
	
	bool emptyLine(char line[])
		{
		//A line counts as empty only when it holds no characters at all (a line of spaces is not empty).
		return line.arrayLength == 0
		}
	
	bool isNumber(char s)
		{
		//True if s is one of the ten decimal digit characters.
		char digits[] = "0123456789"
		
		for (int d = 0; d < digits.arrayLength; d ++)
			{
			if (s == digits[d]) return true
			}
		
		return false
		}
	
	bool orderedListStart(char str[])
		{
		//True if str begins with an ordered list marker: one or more digits, then ".", then whitespace or the end of the line.
		// - requiring the whitespace keeps ordinary text such as "3.14 is ..." from being read as a list item, in the same
		//   way that the unordered list marker is only recognised as "- "
		for (int i = 0; i < str.arrayLength; i ++)
			{
			if (i > 0 && str[i] == ".")
				{
				//a marker with nothing after it is an (empty) list item
				if (i + 1 == str.arrayLength) return true
				
				return str[i+1] == " " || str[i+1] == "\t"
				}
			
			//anything other than a digit before the "." means this is not a list marker
			if (!isNumber(str[i])) return false
			}
		
		return false
		}
	
	bool isRuleLine(char str[])
		{
		//True if str is a horizontal rule line: it starts with three identical "*", "-" or "_" characters and
		//holds nothing after them except more of that character or whitespace.
		if (str.arrayLength < 3) return false
		
		char marker = str[0]
		
		if (marker != "*" && marker != "-" && marker != "_") return false
		if (str[1] != marker || str[2] != marker) return false
		
		for (int i = 3; i < str.arrayLength; i ++)
			{
			if (str[i] != marker && str[i] != " " && str[i] != "\t" && str[i] != "\r") return false
			}
		
		return true
		}
	
	void processElements(Stack stack, char str[])
		{
		//Processes one line of the document against the block-level context held in stack.
		// - the element on top of stack is the currently open block; depending on the line, the line is added to that block,
		//   closes it (pop), or opens a new block beneath it (push)
		// - stack must hold at least one StackEl; the parse() function puts the document root at its base
		StackEl topEl = stack.peek()
		MDElement top = topEl.mde
		MDElement sub = null
		
		if (top.type == MDElement.CODE_BLOCK && top.extType == MDElement.CODE_FENCE)
			{
			//inside a fenced code block every line is taken verbatim, until the closing fence
			if (stringUtil.startsWith(str, "```"))
				{
				stack.pop()
				}
				else
				{
				top.content = new char[](top.content, "$str\n")
				}
			}
			else if (stringUtil.startsWith(str, "    "))
			{
			int startLen = 4
			char kn[] = stringUtil.subString(str, startLen, str.arrayLength - startLen)
			
			//TODO: if we're in a list, we can now have the start of another element type (for processElements)...
			
			if (top.type == MDElement.LIST_ORDERED || top.type == MDElement.LIST_UNORDERED)
				{
				// - so we would get the last child element of top, and add children to that...?
				}
				else if (top.type == MDElement.CODE_BLOCK)
				{
				//this is just a continuation
				top.content = new char[](top.content, "$kn\n")
				}
				else
				{
				//start of a new code block section
				MDElement el = new MDElement(MDElement.CODE_BLOCK)
				
				top.children = new MDElement[](top.children, el)
				stack.push(new StackEl(el))
				
				el.content = "$kn\n"
				}
			}
			else if (stringUtil.startsWith(str, "\t"))
			{
			//TODO: as above...
			}
			else if ((sub = processHeaderLine(str)) != null)
			{
			top.children = new MDElement[](top.children, sub)
			}
			else if (emptyLine(str))
			{
			//an empty line closes the currently open block, unless we are at a PLAIN element (such as the document root)
			if (top.type != MDElement.PLAIN)
				{
				stack.pop()
				topEl = stack.peek()
				top = topEl.mde
				}
			
			//TODO: decide on whether to start a new paragraph
			}
			else if (stringUtil.startsWith(str, ">"))
			{
			char kn[] = stringUtil.subString(str, 1, str.arrayLength - 1).ltrim()
			
			//here we need to create a nested status-tracking stack, so that the thing inside the block quote gets a fresh context
			
			Stack relative = null
			
			if (top.type != MDElement.QUOTE_BLOCK)
				{
				sub = new MDElement(MDElement.QUOTE_BLOCK)
				sub.children = new MDElement(MDElement.PLAIN)
				top.children = new MDElement[](top.children, sub)
				relative = new Stack()
				stack.push(new StackEl(sub, relative))
				relative.push(new StackEl(sub.children[0]))
				}
				else
				{
				relative = topEl.subStack
				//check if a nested block quote just ended...
				//TODO: I don't think this works if multiple nested levels end at the same line...
				StackEl relativeTop = relative.peek()
				if (kn.arrayLength > 0 && kn[0] != ">" && relativeTop.mde.type == MDElement.QUOTE_BLOCK) relative.pop()
				}
			
			//the rest of the line is processed as a line of its own, within the block quote's context
			processElements(relative, kn)
			}
			else if (stringUtil.startsWith(str, "- "))
			{
			char kn[] = stringUtil.subString(str, 1, str.arrayLength - 1).ltrim()
			
			if (top.type != MDElement.LIST_UNORDERED)
				{
				sub = new MDElement(MDElement.LIST_UNORDERED)
				top.children = new MDElement[](top.children, sub)
				stack.push(new StackEl(sub))
				top = sub
				}
			
			top.children = new MDElement[](top.children, new MDElement(MDElement.LIST_ITEM, 0, null, null, parseLine(kn)))
			}
			else if (orderedListStart(str))
			{
			//the item text starts after the "." that ends the number, which may have more than one digit
			// (orderedListStart only succeeds when such a "." exists)
			int dot = 0
			while (dot < str.arrayLength && str[dot] != ".")
				{
				dot ++
				}
			
			char kn[] = stringUtil.subString(str, dot + 1, str.arrayLength - (dot + 1)).ltrim()
			
			if (top.type != MDElement.LIST_ORDERED)
				{
				sub = new MDElement(MDElement.LIST_ORDERED)
				top.children = new MDElement[](top.children, sub)
				stack.push(new StackEl(sub))
				top = sub
				}
			
			top.children = new MDElement[](top.children, new MDElement(MDElement.LIST_ITEM, 0, null, null, parseLine(kn)))
			}
			else if (stringUtil.startsWith(str, "```"))
			{
			//opening fence; whatever follows the back-ticks on this line is kept as the element's extInfo
			if (top.type != MDElement.CODE_BLOCK)
				{
				char language[] = extractArray(str, 3, str.arrayLength)
				
				sub = new MDElement(MDElement.CODE_BLOCK, MDElement.CODE_FENCE, extInfo = language)
				top.children = new MDElement[](top.children, sub)
				stack.push(new StackEl(sub))
				}
			}
			else if (isRuleLine(str))
			{
			//only a line made up of the rule characters is a rule; a line such as "***text*** ..." is ordinary text
			top.children = new MDElement[](top.children, new MDElement(MDElement.HLINE))
			}
			else
			{
			//ordinary text: it goes into the open paragraph, or starts a new one
			if (top.type != MDElement.PARAGRAPH)
				{
				sub = new MDElement(MDElement.PARAGRAPH)
				top.children = new MDElement[](top.children, sub)
				stack.push(new StackEl(sub))
				top = sub
				}
			
			top.children = new MDElement[](top.children, parseLine(str))
			}
		}
	
	MDElement Markdown:parse(char doc[])
		{
		//Parses a complete markdown document and returns the root (a PLAIN element) of the resulting element tree.
		MDElement root = new MDElement(MDElement.PLAIN)
		
		//the stack tracks the currently open block element; the root stays at its base
		Stack stack = new Stack()
		stack.push(new StackEl(root))
		
		String lines[] = toLines(doc)
		
		//the document is processed strictly line by line, in order
		for (int i = 0; i < lines.arrayLength; i++)
			{
			processElements(stack, lines[i].string)
			}
		
		return root
		}
	
	}
Revision history
To propose a new revision to this entity, use dana source put -uc your/new/version.dn -n parsing.Markdown -m "reason for update" -u yourUsername
Version 1 (this version) by barry
Notes for this version: Standard Library Initialisation