Project Dana

Component compress.Archive:targz by barry
// https://www.gnu.org/software/tar/manual/html_node/Standard.html

uses time.DateTime

//number of bytes requested from the compressed file on each read in the decompression loops
const int CHUNK_SIZE = 5120

//status codes for processStreamChunk(); the constructor treats SP_FAIL as an invalid tar file
const int SP_FAIL = 0
const int SP_CONTINUE = 1
const int SP_END = 2

//fixed-size leading part of a gzip header, filled in by the constructor from the first bytes of the file
// - the constructor requires ID1 == 31 and ID2 == 139, and tests FLG against the FLG_* masks below
// - field names appear to follow the gzip specification (RFC 1952)
data GZHeader {
	byte ID1
	byte ID2
	byte CM
//bit masks for the FLG field
const byte FLG_FTEXT = 0x1
const byte FLG_FHCRC = 0x2
const byte FLG_FEXTRA = 0x4
const byte FLG_FNAME = 0x8
const byte FLG_FCOMMENT = 0x10
	byte FLG
	int4 mtime
	byte XFL
	byte OS
	}

//wrapper around a 4-byte integer; readInt4() serialises an instance with dana.serial to fill the value from raw bytes
data Int4 {
	int4 val
	}

//wrapper around a 2-byte integer; readInt2() serialises an instance with dana.serial to fill the value from raw bytes
data Int2 {
	int2 val
	}

//values of the TarHeader.fileType field; of these, only DIRTYPE is referenced by the code in this file
const byte REGTYPE = "0"          /* regular file */
const byte AREGTYPE = 0           /* regular file */
const byte LNKTYPE = "1"          /* link */
const byte SYMTYPE = "2"          /* reserved */
const byte CHRTYPE = "3"          /* character special */
const byte BLKTYPE = "4"          /* block special */
const byte DIRTYPE = "5"          /* directory */
const byte FIFOTYPE = "6"         /* FIFO special */
const byte CONTTYPE = "7"         /* reserved */
const byte XHDTYPE = "x"          /* Extended header referring to the next file in the archive */
const byte XGLTYPE = "g"          /* Global extended header */

//500-byte block
// - one tar header record; the fields declared here total 500 bytes, while processStreamChunk() steps through the stream in units of 512 bytes
// - fileName, fileNamePrefix are read as nul-terminated strings (getNTString)
// - fileMode, fileSize, lastModified are read as octal text (i4FromOctal / i8FromOctal); checksum is read by i4FromChecksum
data TarHeader {
	byte fileName[100]
	byte fileMode[8]
	byte ownerID[8]
	byte groupID[8]
	byte fileSize[12]
	byte lastModified[12]
	byte checksum[8]
	//compared against DIRTYPE to detect directory entries
	int1 fileType
	byte linkedFileName[100] //last field of original TAR format
	//compared against "ustar" by processHeader() to decide whether fileNamePrefix is used
	byte ustar[6]
	byte ustarVersion[2]
	byte ownerUsername[32]
	byte ownerGroupname[32]
	int8 devNoMajor
	int8 devNoMinor
	byte fileNamePrefix[155]
	}

//internal index entry for one archive member, created by processHeader()
data FileIndex {
	//the raw tar header the entry was built from
	TarHeader header
	//the public description of the member (the same instance is also placed in publicIndex)
	ArchiveFile record
	//position in the uncompressed stream at which the member's content begins
	int offset
	//member path as stored in the archive; used as the lookup key by exists() and the extract functions
	char path[]
	}

//one node of the directory tree built by buildTree() / addToTree() and walked by getContents()
data FileTree {
	//a single path component (the text between "/" separators)
	char name[]
	//true if this node is a directory (or an intermediate path component)
	bool dir
	//the record handed out by getContents() for this node
	ArchiveFile record
	FileTree children[]
	}

component provides Archive:targz requires io.Output out, data.IntUtil iu, data.StringUtil stringUtil, time.TimeUnix timeUnix, data.query.Search search, compress.algorithm.StreamCompression:deflate, data.checksum.CRC32 crc32, data.StreamBuffer {
	
	//the archive file, stored by the constructor and re-read by extractFile() / extractFileTo()
	File ifd
	//one entry per tar header processed, in archive order
	FileIndex archiveIndex[]
	//the records returned by getAllContents(), in archive order
	ArchiveFile publicIndex[]
	//root of the directory tree, created by buildTree()
	FileTree fileTree
	
	//file position at which the compressed data starts (just after the gzip header)
	int compressedDataOffset
	
	//reverses the order of the bytes of num, in place
	void reverseEndian(byte num[])
		{
		int lo = 0
		int hi = num.arrayLength - 1
		
		//swap pairs from the outside in, stopping at the middle
		while (lo < num.arrayLength / 2)
			{
			byte held = num[lo]
			num[lo] = num[hi]
			num[hi] = held
			
			lo ++
			hi --
			}
		}
	
	//copies len bytes from src, starting at srcOffset, into dest, starting at destOffset
	void copyBytes(byte dest[], int destOffset, byte src[], int srcOffset, int len)
		{
		int to = destOffset
		int from = srcOffset
		int remaining = len
		
		while (remaining > 0)
			{
			dest[to] = src[from]
			
			to ++
			from ++
			remaining --
			}
		}
	
	//return a Dana char[] from a null-terminated char[]
	// - if nt contains a zero character, a sub-string ending just before the first one is returned; otherwise nt itself is returned
	char[] getNTString(char nt[])
		{
		char c = 0
		int ndx = 0
		if ((ndx = nt.find(c)) != StringUtil.NOT_FOUND)
			{
			//NOTE(review): c (value 0) is passed as the first argument of subString, presumably serving as start index 0 with ndx as the length - confirm against StringUtil
			return nt.subString(c, ndx)
			}
			else
			{
			return nt
			}
		}
	
	//parses an octal text field of a tar header (such as fileSize or lastModified) into an integer
	// - the final byte of ar is treated as the field terminator and is never examined
	// - bytes that are not octal digits ("0" to "7") are skipped without using up a digit position, so nul or space padding neither corrupts nor shifts the result
	// - returns 0 if the field contains no octal digits
	int8 i8FromOctal(byte ar[])
		{
		int8 res = 0
		
		//nothing to parse unless at least one byte precedes the terminator
		if (ar.arrayLength < 2) return res
		
		//n is the number of digits consumed so far; each digit contributes 3 bits
		int n = 0
		
		//walk from the least-significant digit back to the start of the field
		// - pos runs one ahead of the index actually read, so the countdown ends at zero without stepping below it
		for (int pos = ar.arrayLength - 1; pos > 0; pos--)
			{
			byte b = ar[pos-1]
			
			//48 is the character "0" and 55 is the character "7"
			if (b >= 48 && b <= 55)
				{
				int8 rq = b - 48
				res = res | (rq << (n*3))
				n ++
				}
			}
		
		return res
		}
	
	//parses octal text from ar into an integer, reading from index topIndex (the least-significant digit) back to index 0
	// - bytes that are not octal digits ("0" to "7") are skipped without using up a digit position, so nul or space padding neither corrupts nor shifts the result
	// - returns 0 if no octal digits are found
	int4 i4FromOctal(byte ar[], int topIndex)
		{
		int4 res = 0
		
		//n is the number of digits consumed so far; each digit contributes 3 bits
		int n = 0
		
		//pos runs one ahead of the index actually read, so the countdown ends at zero without stepping below it
		for (int pos = topIndex + 1; pos > 0; pos--)
			{
			byte b = ar[pos-1]
			
			//48 is the character "0" and 55 is the character "7"
			if (b >= 48 && b <= 55)
				{
				int4 rq = b - 48
				res = res | (rq << (n*3))
				n ++
				}
			}
		
		return res
		}
	
	//there appear to be two different ways of storing a tar checksum; we accept both
	// - standard form: octal digits, then a nul, then a space (the digits end three bytes before the end of the field)
	// - alternative form: octal digits, then only a nul (the digits end two bytes before the end of the field)
	// - the digits themselves are parsed by i4FromOctal in either case
	int4 i4FromChecksum(byte ar[])
		{
		if (ar[ar.arrayLength-1] == " ")
			return i4FromOctal(ar, ar.arrayLength-3)
			else
			return i4FromOctal(ar, ar.arrayLength-2)
		}
	
	//reads one tar header directly from fd, starting at file position offset
	TarHeader readFileHeader(File fd, int offset)
		{
		TarHeader hdr = new TarHeader()
		
		//raw is the serialised byte view of hdr; copying bytes into it fills in hdr's fields
		byte raw[] = dana.serial(hdr)
		
		fd.setPos(offset)
		raw =[] fd.read(raw.arrayLength)
		
		return hdr
		}
	
	//builds a tar header from the bytes of array that begin at index offset
	TarHeader readArrayHeader(byte array[], int offset)
		{
		TarHeader hdr = new TarHeader()
		
		//raw is the serialised byte view of hdr; copying bytes into it fills in hdr's fields
		byte raw[] = dana.serial(hdr)
		
		for (int i = 0; i < raw.arrayLength; i++)
			{
			raw[i] = array[offset+i]
			}
		
		return hdr
		}
	
	//checks the checksum stored in a tar header against one calculated from the header's bytes
	// - the calculated value is the sum of all serialised header bytes, with each byte of the checksum field itself counted as 32 (a space)
	bool checksumOK(TarHeader record)
		{
		byte raw[] = dana.serial(record)
		
		//sum of every header byte, checksum field included
		int4 total = 0
		
		for (int i = 0; i < raw.arrayLength; i++)
			{
			total += raw[i]
			}
		
		//replace the contribution of each checksum byte with that of a space
		int4 expected = total
		
		for (int k = 0; k < record.checksum.arrayLength; k++)
			{
			expected -= record.checksum[k]
			expected += 32
			}
		
		int4 stored = i4FromChecksum(record.checksum)
		
		return stored == expected
		}
	
	//returns true if every serialised byte of the header is zero
	bool blankHeader(TarHeader record)
		{
		byte raw[] = dana.serial(record)
		
		bool blank = true
		
		for (int i = 0; i < raw.arrayLength; i++)
			{
			if (raw[i] != 0)
				{
				blank = false
				break
				}
			}
		
		return blank
		}
	
	//inserts path into fileTree, creating a node for each "/"-separated component that does not exist yet
	// - components other than the last are always created as directories; the last is a directory only if isDir is set
	// - a newly created non-directory leaf takes its sizes and modified time from info; nodes that already exist are left as they are
	void addToTree(char path[], ArchiveFile info, bool isDir)
		{
		String parts[] = stringUtil.explode(path, "/")
		
		FileTree cursor = fileTree
		
		for (int i = 0; i < parts.arrayLength; i++)
			{
			char partName[] = parts[i].string
			bool lastPart = i == parts.arrayLength-1
			
			FileTree child = cursor.children.findFirst(FileTree.[name], new FileTree(partName))
			
			if (child == null)
				{
				child = new FileTree(partName)
				child.dir = !lastPart || isDir
				child.record = new ArchiveFile(partName, child.dir)
				
				//a file leaf carries the details of the archive entry itself
				if (lastPart && !isDir)
					{
					child.record.compressedSize = info.compressedSize
					child.record.uncompressedSize = info.uncompressedSize
					child.record.modified = info.modified
					}
				
				cursor.children = new FileTree[](cursor.children, child)
				}
			
			//descend into the (found or newly created) node
			cursor = child
			}
		}
	
	//rebuilds fileTree from scratch, adding every entry of archiveIndex in order
	void buildTree()
		{
		//the root is a directory with no name
		fileTree = new FileTree(dir = true)
		
		for (int i = 0; i < archiveIndex.arrayLength; i++)
			{
			FileIndex entry = archiveIndex[i]
			bool isDir = entry.header.fileType == DIRTYPE
			
			addToTree(entry.path, entry.record, isDir)
			}
		}
	
	//records the archive member described by tar header hdr in both publicIndex and archiveIndex
	// - dataOffset is the position in the uncompressed stream at which the member's content begins
	void processHeader(TarHeader hdr, int dataOffset)
		{
		char fileName[] = getNTString(hdr.fileName)
		char fileNamePre[] = getNTString(hdr.fileNamePrefix)
		
		//we should only use fileNamePrefix, and other extended fields, if hdr.ustar is set to the string "ustar\0"
		// - the field is nul-trimmed before comparing, so the result does not depend on how the trailing nul of the 6-byte field compares with the 5-character literal
		bool ustar = getNTString(hdr.ustar) == "ustar"
		
		int8 sz = i8FromOctal(hdr.fileSize)
		
		int8 mod = i8FromOctal(hdr.lastModified)
		
		//add to directory
		char fn[] = null
		
		//in the ustar format a non-empty prefix is joined to the name with a "/" between the two
		if (ustar && fileNamePre.arrayLength > 0)
			fn = new char[](fileNamePre, "/", fileName)
			else
			fn = fileName
		
		ArchiveFile naf = new ArchiveFile(fn)
		naf.dir = hdr.fileType == DIRTYPE
		naf.modified = timeUnix.fromUnixTime(mod)
		//tar stores content uncompressed within the stream, so both sizes are the tar size field
		naf.compressedSize = sz
		naf.uncompressedSize = sz
		
		publicIndex = new ArchiveFile[](publicIndex, naf)
		
		FileIndex nfi = new FileIndex(hdr, naf)
		nfi.offset = dataOffset
		nfi.path = fn
		
		archiveIndex = new FileIndex[](archiveIndex, nfi)
		}
	
	//reads a 2-byte integer from fd, reversing the order of the two bytes read
	int2 readInt2(File fd)
		{
		Int2 holder = new Int2()
		
		//view is the serialised byte view of holder; writing into it sets holder.val
		byte view[] = dana.serial(holder)
		
		view =[] fd.read(view.arrayLength)
		
		//deal with endianness
		byte first = view[0]
		view[0] = view[1]
		view[1] = first
		
		return holder.val
		}
	
	//converts the 4 bytes of buf into an integer, reversing the order of the bytes
	int4 readInt4(byte buf[])
		{
		Int4 holder = new Int4()
		
		//view is the serialised byte view of holder; writing into it sets holder.val
		byte view[] = dana.serial(holder)
		
		view =[] buf
		
		//deal with endianness: exchange the inner pair, then the outer pair
		byte inner = view[1]
		view[1] = view[2]
		view[2] = inner
		
		byte outer = view[0]
		view[0] = view[3]
		view[3] = outer
		
		return holder.val
		}
	
	//reads a length-prefixed section from fd: a 2-byte length (via readInt2), followed by that many bytes of data, which are returned
	byte[] readOptionalHeader(File fd)
		{
		int2 sectionLength = readInt2(fd)
		
		byte section[] = fd.read(sectionLength)
		
		return section
		}
	
	//reads from fd one byte at a time until a zero byte is read or the end of the file is reached, returning the bytes collected before that point
	// - used by the constructor for the file name and comment fields of the gzip header
	// - NOTE(review): b is declared as an array yet is initialised from, and compared against, plain numbers; this presumably relies on how Dana treats single-element arrays - confirm before changing
	char[] readToZero(File fd)
		{
		byte buf[] = null
		byte b[] = 1
		while (!fd.eof() && b != 0)
			{
			b = fd.read(1)
			//append the byte just read unless it is the terminator
			if (b != 0) buf = new byte[](buf, b)
			}
		
		return buf
		}
	
	//position, within the uncompressed stream, of the next tar header that processStreamChunk() expects to read
	int nextTarHeader = 0
	
	//scans one chunk of the uncompressed tar stream for member headers, indexing each one found via processHeader()
	// - chunk holds the stream bytes that begin at stream position offset; chunks must be passed in stream order
	// - returns SP_END once a blank header (end of archive) has been seen, SP_FAIL if a header is cut short by the end of the chunk, or SP_CONTINUE if the next header lies in a later chunk
	// - throws an exception if a header's checksum does not match
	int processStreamChunk(byte chunk[], int offset)
		{
		//nextTarHeader only falls behind offset when an earlier chunk stopped at the end-of-archive marker
		// - subtracting offset from it below would then give a meaningless index (this file counts down to INT_MAX, so its ints appear to wrap below zero)
		if (nextTarHeader < offset) return SP_END
		
		while (nextTarHeader < (offset + chunk.arrayLength))
			{
			int hdrPos = nextTarHeader - offset
			
			//the stream is stepped through in 512-byte blocks, so a header with fewer than 512 bytes left in the chunk has been cut short
			if (chunk.arrayLength - hdrPos < 512) return SP_FAIL
			
			TarHeader hdr = readArrayHeader(chunk, hdrPos)
			
			//NOTE: we're supposed to get two empty headers in a row, to indicate end-of-stream, but we stop at one
			if (blankHeader(hdr)) return SP_END
			
			if (!checksumOK(hdr))
				{
				throw new Exception("invalid tar file (checksum failure)")
				}
			
			int8 sz = i8FromOctal(hdr.fileSize)
			processHeader(hdr, nextTarHeader + 512)
			
			//locate the next record: skip this header and its content, then round up to the next 512-byte boundary
			nextTarHeader = nextTarHeader + 512 + sz
			
			int add = 0
			if (nextTarHeader % 512 != 0) add = 512 - (nextTarHeader % 512)
			
			nextTarHeader = nextTarHeader + add
			}
			//else: wait for the next chunk that's in the range I want
		
		return SP_CONTINUE
		}
	
	//opens a gzip-compressed tar archive and indexes its members
	// - the gzip header is read from fd's current position; fd is kept for later extractFile() / extractFileTo() calls
	// - the whole compressed stream is decompressed once, in pieces, and each piece is scanned for tar headers
	// - throws an exception if the gzip header is short or lacks the gzip magic bytes, or if the tar stream is invalid
	Archive:Archive(store File fd)
		{
		StreamCompression alg = new StreamCompression:deflate()
		
		GZHeader mainHeader = new GZHeader()
		char fileName[]
		char memberComment[]
		byte copyBuf[] = dana.serial(mainHeader)
		
		//read the standard header
		byte buf[] = fd.read(copyBuf.arrayLength)
		
		if (buf.arrayLength != copyBuf.arrayLength)
			{
			throw new Exception("insufficient header bytes (file corrupted?)")
			}
		
		//copying into the serialised view fills in the fields of mainHeader
		copyBuf =[] buf
		
		//check for magic gzip header bytes
		if (mainHeader.ID1 == 31 && mainHeader.ID2 == 139)
			{
			//read (and discard) the optional extra section, if present
			if ((mainHeader.FLG & GZHeader.FLG_FEXTRA) == GZHeader.FLG_FEXTRA)
				{
				byte extraField[] = readOptionalHeader(fd)
				}
			
			//read filename, if present
			if ((mainHeader.FLG & GZHeader.FLG_FNAME) == GZHeader.FLG_FNAME)
				fileName = readToZero(fd)
			
			//read comment, if present
			if ((mainHeader.FLG & GZHeader.FLG_FCOMMENT) == GZHeader.FLG_FCOMMENT)
				memberComment = readToZero(fd)
			
			//skip the 2-byte header CRC, if present; in the gzip format it is the last optional field before the compressed data
			if ((mainHeader.FLG & GZHeader.FLG_FHCRC) == GZHeader.FLG_FHCRC)
				{
				byte headerCRC[] = fd.read(2)
				}
			
			//now comes the compressed content
			compressedDataOffset = fd.getPos()
			
			int4 crcCheck
			byte chunk[]
			int ucOffset = 0
			int ucSize = 0
			
			//size of the uncompressed pieces handed to the tar scanner; must be a multiple of 512 so that tar headers never straddle two pieces
			int scanSize = 5120
			
			//becomes SP_END once the end-of-archive marker has been seen, after which no further tar scanning is done
			int tarStatus = SP_CONTINUE
			
			//a StreamBuffer allows us to buffer up data of a given length, for reading out
			// - this is useful because we don't know what size a decompressed chunk will end up as
			StreamBuffer sbuf = new StreamBuffer()
			
			alg.decompressInit()
			
			while (!fd.eof() && alg.decompressStatus() == StreamCompression.DS_CONTINUE)
				{
				buf = fd.read(CHUNK_SIZE)
				
				chunk = alg.decompress(buf)
				
				sbuf.write(chunk)
				
				while (sbuf.getSize() >= scanSize)
					{
					byte dchunk[] = sbuf.read(scanSize)
					crcCheck = crc32.makeCRC(crcCheck, dchunk)
					
					if (tarStatus == SP_CONTINUE)
						{
						tarStatus = processStreamChunk(dchunk, ucOffset)
						if (tarStatus == SP_FAIL) throw new Exception("invalid tar file")
						}
					
					ucOffset += dchunk.arrayLength
					ucSize += dchunk.arrayLength
					}
				}
			
			//whatever is left in the buffer is the final, possibly shorter, piece
			if (sbuf.getSize() != 0)
				{
				byte dchunk[] = sbuf.read(sbuf.getSize())
				crcCheck = crc32.makeCRC(crcCheck, dchunk)
				
				if (tarStatus == SP_CONTINUE)
					{
					tarStatus = processStreamChunk(dchunk, ucOffset)
					if (tarStatus == SP_FAIL) throw new Exception("invalid tar file")
					}
				
				ucSize += dchunk.arrayLength
				}
			
			int lcSize = alg.decompressEnd()
			
			//adjust the file read head to wherever the last compressed chunk actually ended
			fd.setPos(fd.getPos() - (buf.arrayLength - lcSize))
			
			//read CRC and ISIZE (original size of uncompressed data, mod 2^32)
			// - NOTE(review): neither value is currently compared against crcCheck / ucSize
			int4 crc = readInt4(fd.read(4))
			
			int4 isize = readInt4(fd.read(4))
			
			buildTree()
			}
			else
			{
			throw new Exception("file does not have a gzip header")
			}
		
		ifd = fd
		}
	
	//returns the record of every member indexed by the constructor, in archive order
	ArchiveFile[] Archive:getAllContents()
		{
		return publicIndex
		}
	
	//returns the records of the immediate children of directory path; a null path selects the root of the archive
	// - throws an exception if a component of path is not found, or if path names something that is not a directory
	ArchiveFile[] Archive:getContents(char path[])
		{
		FileTree node = fileTree
		
		if (path != null)
			{
			String parts[] = path.explode("/")
			
			//walk down the tree one path component at a time
			for (int i = 0; i < parts.arrayLength; i++)
				{
				FileTree match = node.children.findFirst(FileTree.[name], new FileTree(parts[i].string))
				
				if (match == null) throw new Exception("directory '$path' not found in archive")
				
				node = match
				}
			}
		
		if (node == null) return null
		
		if (!node.dir) throw new Exception("path '$path' is not a directory")
		
		int count = node.children.arrayLength
		ArchiveFile result[] = new ArchiveFile[count]
		
		for (int k = 0; k < count; k++)
			{
			result[k] = node.children[k].record
			}
		
		return result
		}
	
	//returns true if archiveIndex holds an entry whose path field matches path
	bool Archive:exists(char path[])
		{
		return archiveIndex.findFirst(FileIndex.[path], new FileIndex(path = path)) != null
		}
	
	//returns the result of searching publicIndex for a record whose path field matches path (presumably null when there is none, as exists() tests the same kind of search against null)
	ArchiveFile Archive:getInfo(char path[])
		{
		return publicIndex.findFirst(ArchiveFile.[path], new ArchiveFile(path = path))
		}
	
	//copies all of src into dest, placing the first byte at index start of dest
	void copyArray(byte dest[], byte src[], int start)
		{
		for (int i = 0; i < src.arrayLength; i++)
			{
			dest[start + i] = src[i]
			}
		}
	
	//returns a copy of the part of an uncompressed chunk that belongs to one archive member
	// - ucData is the chunk, which starts at stream position ucOffset
	// - fileOffset and fileSize give the member's position and length in the stream; written is how much of it has already been taken from earlier chunks
	byte[] getStreamSection(int ucOffset, byte ucData[], int fileOffset, int fileSize, int written)
		{
		int available = ucData.arrayLength
		int start = 0
		int count = 0
		
		if (fileOffset > ucOffset)
			{
			//the member begins part-way through this chunk
			start = fileOffset - ucOffset
			
			if (start + fileSize > available)
				count = available - start
				else
				count = fileSize
			}
			else
			{
			//the member began at, or before, the start of this chunk
			int remaining = fileSize - written
			
			if (remaining > available)
				count = available
				else
				count = remaining
			}
		
		byte writeChunk[] = new byte[count]
		
		copyBytes(writeChunk, 0, ucData, start, count)
		
		return writeChunk
		}
	
	//extracts the archive member at path and returns its content
	// - the compressed stream is decompressed again from its start, and the member's bytes are picked out as they pass
	// - throws an exception if path is not in the archive
	byte[] Archive:extractFile(char path[])
		{
		FileIndex fi = archiveIndex.findFirst(FileIndex.[path], new FileIndex(path = path))
		
		if (fi == null) throw new Exception("file '$path' not found in archive")
		
		int wanted = fi.record.uncompressedSize
		byte result[] = new byte[wanted]
		
		//streamPos is the stream position of the current chunk; copied is the number of member bytes gathered so far
		int streamPos = 0
		int copied = 0
		
		ifd.setPos(compressedDataOffset)
		
		StreamCompression alg = new StreamCompression:deflate()
		alg.decompressInit()
		
		while (!ifd.eof() && alg.decompressStatus() == StreamCompression.DS_CONTINUE)
			{
			byte packed[] = ifd.read(CHUNK_SIZE)
			byte chunk[] = alg.decompress(packed)
			
			//only chunks that reach the member's starting offset contribute anything
			if (fi.offset < streamPos + chunk.arrayLength)
				{
				byte piece[] = getStreamSection(streamPos, chunk, fi.offset, wanted, copied)
				
				copyBytes(result, copied, piece, 0, piece.arrayLength)
				
				copied += piece.arrayLength
				}
			
			streamPos += chunk.arrayLength
			
			if (copied == wanted) break
			}
		
		int lcSize = alg.decompressEnd()
		
		return result
		}
	
	//extracts the archive member at path, writing its content to ofd
	// - the compressed stream is decompressed again from its start, and the member's bytes are written out as they pass
	// - returns true if the whole member was written, or false if the stream ended before that
	// - throws an exception if path is not in the archive
	bool Archive:extractFileTo(char path[], File ofd)
		{
		FileIndex fi = archiveIndex.findFirst(FileIndex.[path], new FileIndex(path = path))
		
		if (fi != null)
			{
			ifd.setPos(compressedDataOffset)
			
			int size = fi.record.uncompressedSize
			byte chunk[]
			int ucOffset = 0
			int written = 0
			
			StreamCompression alg = new StreamCompression:deflate()
			
			alg.decompressInit()
			
			while (!ifd.eof() && alg.decompressStatus() == StreamCompression.DS_CONTINUE)
				{
				byte buf[] = ifd.read(CHUNK_SIZE)
				
				chunk = alg.decompress(buf)
				
				byte writeChunk[] = null
				
				//only chunks that reach the member's starting offset contribute anything
				if (fi.offset < ucOffset + chunk.arrayLength)
					{
					//decide how much to write
					
					writeChunk = getStreamSection(ucOffset, chunk, fi.offset, size, written)
					written += writeChunk.arrayLength
					
					ofd.write(writeChunk)
					}
				
				ucOffset += chunk.arrayLength
				
				if (written == size) break
				}
			
			int lcSize = alg.decompressEnd()
			
			//success only if every byte of the member was found in the stream
			return written == size
			}
			else
			{
			throw new Exception("file '$path' not found in archive")
			}
		
		return false
		}
	
	}
Revision history
To propose a new revision to this entity, use dana source put -uc your/new/version.dn -n compress.Archive:targz -m "reason for update" -u yourUsername
Version 2 (this version) by barry
Notes for this version: Updates to prepare for upcoming compiler strictness changes in function parameter qualifier equivalence
Version 1 by barry