Project Dana

Component compress.Archive:zip by barry
/*
This is a partial implementation of the ZIP file format. It is designed for maximum compatibility with other ZIP archive readers, uses only the simplest features of the ZIP format, and supports only the deflate (zlib) compression algorithm. It currently uses only "MS-DOS" (FAT) file system assumptions, and does not encode file attributes. This is sufficient for many common uses of ZIP files, but certainly doesn't cover all requirements.

The most obvious future upgrades would be:
 - support for other file system formats
 - support for file attributes (if/when Dana's file system APIs support this)
 - support for additional compression algorithms
 - support for ZIP64, for archive sizes > 4GB
*/

// https://support.pkware.com/display/PKZIP/APPNOTE

uses time.DateTime

// values of the compressionType header field that the extract functions accept:
// ZIP_CM_NONE entries are copied through as-is, ZIP_CM_DEFLATE entries go through StreamCompression:deflate
const int ZIP_CM_NONE = 0
const int ZIP_CM_DEFLATE = 8

// maximum number of bytes requested from the archive file per read while extracting an entry
const int CHUNK_SIZE = 5120

// Fixed-size header that readLFH reads at an entry's headerOffset, immediately before the entry's file name, extra field and data.
// Presumably the ZIP "local file header" described in the APPNOTE - confirm field meanings against that document.
// readLFH byte-reverses every multi-byte field after the raw read; sig is left as read.
data LFH {
	// signature bytes for this record type (readLFH does not currently compare sig against this)
	const byte MAGIC[] = new byte[](0x50, 0x4b, 0x03, 0x04)
	byte sig[4]
	int2 versionNeeded
	int2 flags
	// compared against ZIP_CM_NONE / ZIP_CM_DEFLATE by the extract functions
	int2 compressionType
	int2 lastModifiedTime
	int2 lastModifiedDate
	int4 originCRC
	int4 compressedSize
	int4 originSize
	// number of bytes of file name that follow this header
	int2 fileNameLen
	// number of bytes of extra field that follow the file name
	int2 exFieldLen
	}

// Fixed-size record that readCDR reads once per archive entry, starting at the offset given by EOCD.cdOffsetBytes.
// Presumably the ZIP "central directory" file header described in the APPNOTE - confirm field meanings against that document.
// readCDR byte-reverses every multi-byte field after the raw read; sig is left as read.
data CDR {
	// signature bytes for this record type (readCDR does not currently compare sig against this)
	const byte MAGIC[] = new byte[](0x50, 0x4b, 0x01, 0x02)
	byte sig[4]
	int2 versionMaker
	int2 versionNeeded
	int2 flags
	int2 compressionType
	// passed, together with lastModifiedDate, to timeDOS.fromDOSTime to produce an entry's modified time
	int2 lastModifiedTime
	int2 lastModifiedDate
	int4 originCRC
	// number of bytes of entry data the extract functions read from the archive
	int4 compressedSize
	// reported as the entry's uncompressedSize
	int4 originSize
	// byte lengths of the file name, extra field and comment that follow this record, in that order
	int2 fileNameLen
	int2 exFieldLen
	int2 commentLen
	int2 diskIndex
	int2 intAttributes
	int4 extAttributes
	// file position at which the extract functions read this entry's LFH
	int4 headerOffset
	}

// End-of-central-directory record, located by getEOCD by scanning backwards from the end of the archive file.
// getEOCD byte-reverses every multi-byte field once the record has been found; sig is left as read.
data EOCD {
	// signature bytes that validEOCD compares sig against
	const byte MAGIC[] = new byte[](0x50, 0x4b, 0x05, 0x06)
	byte sig[4]
	int2 diskNo
	int2 cdStartDisk
	int2 cdCountDisk
	// number of CDR records the constructor reads
	int2 cdCountTotal
	int4 cdSizeBytes
	// file position the constructor seeks to before reading the first CDR
	int4 cdOffsetBytes
	int2 commentLen
	}

// One archive entry as held in the component's internal index: the parsed CDR plus the entry's file name as read from the archive.
data FileIndex {
	CDR cdr
	// full entry path exactly as stored in the archive; used as the lookup key by exists / extractFile / extractFileTo
	char path[]
	}

// Node of the directory tree built by buildTree / addToTree and walked by getContents.
data FileTree nocycle {
	// a single path component (the text between "/" separators), not the full path
	char name[]
	// true for the root, for intermediate path components, and for entries whose path ends in "/"
	bool dir
	// the ArchiveFile returned to callers of getContents for this node
	ArchiveFile record
	FileTree children[]
	}

// maximum number of one-byte backward steps getEOCD takes from the end of the file before giving up on finding the EOCD signature
const int MAX_HDR_LOCATE_TRIES = 1000

// Holder for a single int4, so that reverseInt4 can pass it to dana.serial and reverse its bytes.
data Int4 {
	int4 n
	}

// Holder for a single int2, so that reverseInt2 can pass it to dana.serial and reverse its bytes.
data Int2 {
	int2 n
	}

component provides Archive:zip requires io.Output out, data.IntUtil iu, data.StringUtil stringUtil, time.TimeDOS timeDOS, compress.algorithm.StreamCompression:deflate, data.checksum.CRC32 crc32, data.query.Search search {
	
	// the archive file handed to the constructor; the extract functions read entry data from it
	File ifd
	// one element per central directory record, in the order the records were read
	FileIndex archiveIndex[]
	// flat listing returned by getAllContents and searched by getInfo; filled in the same order as archiveIndex
	ArchiveFile publicIndex[]
	// root of the directory tree created by buildTree and walked by getContents
	FileTree fileTree
	
	// Reverses the order of the bytes held in num, in place.
	void reverseEndian(byte num[])
		{
		int len = num.arrayLength
		
		// swap each byte in the first half with its mirror position in the second half
		for (int lo = 0; lo < len / 2; lo++)
			{
			int hi = len - 1 - lo
			byte held = num[lo]
			num[lo] = num[hi]
			num[hi] = held
			}
		}
	
	// Returns i4 with the order of its bytes reversed.
	int4 reverseInt4(int4 i4)
		{
		// hold the value in a data instance so that its bytes can be reached through dana.serial
		Int4 box = new Int4(i4)
		byte raw[] = dana.serial(box)
		
		reverseEndian(raw)
		
		return box.n
		}
	
	// Returns i2 with the order of its bytes reversed.
	int2 reverseInt2(int2 i2)
		{
		// hold the value in a data instance so that its bytes can be reached through dana.serial
		Int2 box = new Int2(i2)
		byte raw[] = dana.serial(box)
		
		reverseEndian(raw)
		
		return box.n
		}
	
	// Reports whether record carries the EOCD signature; this is currently the only check made.
	bool validEOCD(EOCD record)
		{
		//TODO: add more checks...
		
		return record.sig == EOCD.MAGIC
		}
	
	// Locates and parses the end-of-central-directory (EOCD) record of the archive in fd.
	//  - fd: the archive file; its read position is moved by this function
	//  - returns the EOCD with its multi-byte fields byte-reversed, or null if no record with the EOCD
	//    signature is found within MAX_HDR_LOCATE_TRIES backward steps (or before the start of the file)
	EOCD getEOCD(File fd)
		{
		//locate the end-of-central-directory (EOCD) record (which is not so easy, for zip32)
		// - we need to use multiple verification points to check this: do we have the right magic constant; does the comment length equal the remainder; does the thing pointed to by the start-of-central-directory field have the right magic constant
		// - we also need a max value on how far back we're willing to walk (i.e., max length of EOCD comment) before we give up
		
		EOCD record = new EOCD()
		
		//srd is the raw byte view of record: copying file bytes into srd fills in record's fields
		byte srd[] = dana.serial(record)
		
		//a file shorter than one EOCD record cannot contain one; checking here also stops the subtraction below from wrapping around
		int fileSize = fd.getSize()
		if (fileSize < srd.arrayLength) return null
		
		bool headerFound = false
		bool reachedStart = false
		
		int nextPos = fileSize - srd.arrayLength
		int tries = 0
		
		//walk backwards one byte at a time from the last position at which a whole record fits
		while (!headerFound && !reachedStart && tries < MAX_HDR_LOCATE_TRIES)
			{
			fd.setPos(nextPos)
			srd =[] fd.read(srd.arrayLength)
			
			if (validEOCD(record))
				{
				headerFound = true
				}
				else if (nextPos == 0)
				{
				//nothing left to scan; stepping back further would wrap the position around
				reachedStart = true
				}
				else
				{
				nextPos --
				tries ++
				}
			}
		
		if (!headerFound) return null
		
		//reverse byte ordering of fields...
		record.diskNo = reverseInt2(record.diskNo)
		record.cdStartDisk = reverseInt2(record.cdStartDisk)
		record.cdCountDisk = reverseInt2(record.cdCountDisk)
		record.cdCountTotal = reverseInt2(record.cdCountTotal)
		record.cdSizeBytes = reverseInt4(record.cdSizeBytes)
		record.cdOffsetBytes = reverseInt4(record.cdOffsetBytes)
		record.commentLen = reverseInt2(record.commentLen)
		
		return record
		}
	
	// Reads one CDR from the current position of fd and returns it with its multi-byte fields byte-reversed.
	// The read position of fd is left just after the fixed-size part of the record.
	CDR readCDR(File fd)
		{
		CDR cdr = new CDR()
		
		//raw is the byte view of cdr: copying file bytes into raw fills in cdr's fields
		byte raw[] = dana.serial(cdr)
		raw =[] fd.read(raw.arrayLength)
		
		//TODO: check magic etc.
		
		//version, flag and compression fields
		cdr.versionMaker = reverseInt2(cdr.versionMaker)
		cdr.versionNeeded = reverseInt2(cdr.versionNeeded)
		cdr.flags = reverseInt2(cdr.flags)
		cdr.compressionType = reverseInt2(cdr.compressionType)
		
		//timestamp fields
		cdr.lastModifiedTime = reverseInt2(cdr.lastModifiedTime)
		cdr.lastModifiedDate = reverseInt2(cdr.lastModifiedDate)
		
		//checksum and size fields
		cdr.originCRC = reverseInt4(cdr.originCRC)
		cdr.compressedSize = reverseInt4(cdr.compressedSize)
		cdr.originSize = reverseInt4(cdr.originSize)
		
		//lengths of the variable-size parts that follow the record
		cdr.fileNameLen = reverseInt2(cdr.fileNameLen)
		cdr.exFieldLen = reverseInt2(cdr.exFieldLen)
		cdr.commentLen = reverseInt2(cdr.commentLen)
		
		//disk, attribute and offset fields
		cdr.diskIndex = reverseInt2(cdr.diskIndex)
		cdr.intAttributes = reverseInt2(cdr.intAttributes)
		cdr.extAttributes = reverseInt4(cdr.extAttributes)
		cdr.headerOffset = reverseInt4(cdr.headerOffset)
		
		return cdr
		}
	
	// Seeks fd to offset, reads one LFH there and returns it with its multi-byte fields byte-reversed.
	// The read position of fd is left just after the fixed-size part of the header.
	LFH readLFH(File fd, int offset)
		{
		LFH lfh = new LFH()
		
		//raw is the byte view of lfh: copying file bytes into raw fills in lfh's fields
		byte raw[] = dana.serial(lfh)
		
		fd.setPos(offset)
		raw =[] fd.read(raw.arrayLength)
		
		//TODO: check magic etc.
		
		//version, flag and compression fields
		lfh.versionNeeded = reverseInt2(lfh.versionNeeded)
		lfh.flags = reverseInt2(lfh.flags)
		lfh.compressionType = reverseInt2(lfh.compressionType)
		
		//timestamp fields
		lfh.lastModifiedTime = reverseInt2(lfh.lastModifiedTime)
		lfh.lastModifiedDate = reverseInt2(lfh.lastModifiedDate)
		
		//checksum and size fields
		lfh.originCRC = reverseInt4(lfh.originCRC)
		lfh.compressedSize = reverseInt4(lfh.compressedSize)
		lfh.originSize = reverseInt4(lfh.originSize)
		
		//lengths of the variable-size parts that follow the header
		lfh.fileNameLen = reverseInt2(lfh.fileNameLen)
		lfh.exFieldLen = reverseInt2(lfh.exFieldLen)
		
		return lfh
		}
	
	// Inserts one archive entry into fileTree, creating any path components that are not yet present.
	//  - path: the entry's full path as stored in the archive, with "/" separators; a trailing "/" marks a directory entry
	//  - cdr: the entry's central directory record, used to fill in size and modified-time details for file leaves
	// Components that already exist in the tree are reused and left unchanged.
	void addToTree(char path[], CDR cdr)
		{
		//only look at the last character when there is one (same guard the constructor applies to entry names)
		bool isDir = path.arrayLength > 0 && path[path.arrayLength-1] == "/"
		String parts[] = stringUtil.explode(path, "/")
		
		FileTree node = fileTree
		
		for (int i = 0; i < parts.arrayLength; i++)
			{
			bool lastPart = (i == parts.arrayLength-1)
			
			FileTree nextNode = node.children.findFirst(FileTree.[name], new FileTree(parts[i].string))
			
			if (nextNode == null)
				{
				nextNode = new FileTree(parts[i].string)
				//every component before the last is a directory; the last one is a directory only for "/"-terminated paths
				nextNode.dir = !lastPart || isDir
				nextNode.record = new ArchiveFile(parts[i].string, nextNode.dir)
				
				//check if it's a leaf node, and fill in its details if so...
				if (!isDir && lastPart)
					{
					nextNode.record.compressedSize = cdr.compressedSize
					nextNode.record.uncompressedSize = cdr.originSize
					nextNode.record.modified = timeDOS.fromDOSTime(cdr.lastModifiedDate, cdr.lastModifiedTime)
					}
				
				node.children = new FileTree[](node.children, nextNode)
				}
			
			//descend into the (found or newly created) component
			node = nextNode
			}
		}
	
	// Replaces fileTree with a fresh root directory node and inserts every entry of archiveIndex into it.
	void buildTree()
		{
		fileTree = new FileTree(dir = true)
		
		for (int n = 0; n < archiveIndex.arrayLength; n++)
			{
			FileIndex entry = archiveIndex[n]
			addToTree(entry.path, entry.cdr)
			}
		}
	
	// Opens the ZIP archive in fd: locates the EOCD, reads every central directory record, and builds
	// the internal index, the public listing and the directory tree.
	//  - fd: the archive file; it is kept and later read by extractFile / extractFileTo
	//  - throws an Exception if no EOCD record is found, or if a central directory record does not
	//    carry the expected signature
	Archive:Archive(File fd)
		{
		EOCD eocd = getEOCD(fd)
		
		if (eocd == null) throw new Exception("unrecognised archive format")
		
		fd.setPos(eocd.cdOffsetBytes)
		
		//parse into locals first, so that component state is only assigned once the whole directory has been read
		FileIndex index[] = new FileIndex[eocd.cdCountTotal]
		ArchiveFile listing[] = new ArchiveFile[eocd.cdCountTotal]
		
		for (int i = 0; i < eocd.cdCountTotal; i++)
			{
			CDR cdr = readCDR(fd)
			
			//a record without the CDR signature means the directory is corrupt or truncated; indexing it would produce garbage entries
			if (!(cdr.sig == CDR.MAGIC)) throw new Exception("corrupt archive: invalid central directory record at entry $(i)")
			
			//the variable-size parts must be read in this order to leave fd positioned at the next record
			char fn[] = fd.read(cdr.fileNameLen)
			byte xfdata[] = fd.read(cdr.exFieldLen)
			char comment[] = fd.read(cdr.commentLen)
			
			index[i] = new FileIndex()
			index[i].cdr = cdr
			index[i].path = fn
			
			ArchiveFile naf = new ArchiveFile(fn)
			naf.dir = fn.arrayLength > 0 && fn[fn.arrayLength-1] == "/"
			naf.modified = timeDOS.fromDOSTime(cdr.lastModifiedDate, cdr.lastModifiedTime)
			naf.compressedSize = cdr.compressedSize
			naf.uncompressedSize = cdr.originSize
			listing[i] = naf
			}
		
		ifd = fd
		archiveIndex = index
		publicIndex = listing
		
		buildTree()
		}
	
	// Returns the flat listing of every entry in the archive, in the order the entries were read.
	// The array returned is the component's own listing, not a copy.
	ArchiveFile[] Archive:getAllContents()
		{
		ArchiveFile everything[] = publicIndex
		
		return everything
		}
	
	// Lists the immediate children of a directory inside the archive.
	//  - path: "/"-separated directory path, or null for the root of the archive
	//  - returns one ArchiveFile per child of that directory
	//  - throws an Exception if a path component is not found, or if the path names a file rather than a directory
	ArchiveFile[] Archive:getContents(char path[])
		{
		//start at the root; a null path means the root itself is the directory wanted
		FileTree node = fileTree
		
		if (path != null)
			{
			String parts[] = path.explode("/")
			
			//walk down the tree one path component at a time
			for (int p = 0; p < parts.arrayLength; p++)
				{
				FileTree match = node.children.findFirst(FileTree.[name], new FileTree(parts[p].string))
				
				if (match != null)
					{
					node = match
					}
					else
					{
					throw new Exception("directory '$path' not found in archive")
					}
				}
			}
		
		if (!node.dir) throw new Exception("path '$path' is not a directory")
		
		int count = node.children.arrayLength
		ArchiveFile result[] = new ArchiveFile[count]
		
		for (int c = 0; c < result.arrayLength; c++)
			{
			FileTree child = node.children[c]
			result[c] = child.record
			}
		
		return result
		}
	
	// Reports whether the archive has an entry whose stored path equals path.
	bool Archive:exists(char path[])
		{
		FileIndex probe = new FileIndex(path = path)
		FileIndex hit = archiveIndex.findFirst(FileIndex.[path], probe)
		
		return hit != null
		}
	
	// Returns the first entry of the public listing whose path equals path (the result of findFirst is passed straight back).
	ArchiveFile Archive:getInfo(char path[])
		{
		ArchiveFile probe = new ArchiveFile(path = path)
		ArchiveFile hit = publicIndex.findFirst(ArchiveFile.[path], probe)
		
		return hit
		}
	
	// Copies every byte of src into dest, placing src[0] at dest[start] and continuing upwards from there.
	void copyArray(byte dest[], byte src[], int start)
		{
		for (int k = 0; k < src.arrayLength; k++)
			{
			dest[start + k] = src[k]
			}
		}
	
	// Extracts one archive entry into memory.
	//  - path: the entry's path exactly as stored in the archive
	//  - returns the entry's content (decompressed when the entry uses deflate)
	//  - throws an Exception if there is no entry for path, or if the entry uses a compression method
	//    other than ZIP_CM_NONE / ZIP_CM_DEFLATE
	// NOTE(review): a CRC is accumulated over the output but is not compared against the stored originCRC.
	byte[] Archive:extractFile(char path[])
		{
		FileIndex fi = archiveIndex.findFirst(FileIndex.[path], new FileIndex(path = path))
		
		if (fi != null)
			{
			LFH lfh = readLFH(ifd, fi.cdr.headerOffset)
			
			int4 crc = 0
			
			StreamCompression scom = null
			
			if (lfh.compressionType == ZIP_CM_DEFLATE)
				{
				scom = new StreamCompression:deflate()
				}
				else if (lfh.compressionType == ZIP_CM_NONE)
				{
				
				}
				else
				{
				throw new Exception("unknown compression method $(lfh.compressionType) for '$path'")
				}
			
			//the file name and extra field sit between the LFH and the entry data; reading them moves ifd to the start of the data
			char fn[] = ifd.read(lfh.fileNameLen)
			byte xfdata[] = ifd.read(lfh.exFieldLen)
			
			//the LFH is an unreliable source for size info, since that data may be absent and included in a "data descriptor" instead
			// - so the output buffer is sized from the CDR, the same record the read loop below takes compressedSize from
			byte result[] = new byte[fi.cdr.originSize]
			
			if (lfh.compressionType != ZIP_CM_NONE)
				scom.decompressInit()
			
			//offset is the next write position in result; rdAmt counts the compressed bytes requested so far
			int offset = 0
			int rdAmt = 0
			while (rdAmt < fi.cdr.compressedSize)
				{
				int thisRead = CHUNK_SIZE
				
				//the final chunk is usually shorter than CHUNK_SIZE
				if ((rdAmt + CHUNK_SIZE) > fi.cdr.compressedSize) thisRead = fi.cdr.compressedSize - rdAmt
				
				byte cdata[] = ifd.read(thisRead)
				
				byte ddata[]
				
				if (lfh.compressionType != ZIP_CM_NONE)
					ddata = scom.decompress(cdata)
					else
					ddata = cdata
				
				crc = crc32.makeCRC(crc, ddata)
				
				copyArray(result, ddata, offset)
				
				offset += ddata.arrayLength
				rdAmt += thisRead
				}
			
			if (lfh.compressionType != ZIP_CM_NONE)
				scom.decompressEnd()
			
			return result
			}
			else
			{
			throw new Exception("file '$path' not found in archive")
			}
		
		return null
		}
	
	// Extracts one archive entry, writing its content (decompressed when the entry uses deflate) to ofd chunk by chunk.
	//  - path: the entry's path exactly as stored in the archive
	//  - ofd: the file that receives the extracted bytes
	//  - returns true once all of the entry's data has been processed
	//  - throws an Exception if there is no entry for path, or if the entry uses a compression method
	//    other than ZIP_CM_NONE / ZIP_CM_DEFLATE
	bool Archive:extractFileTo(char path[], File ofd)
		{
		FileIndex probe = new FileIndex(path = path)
		FileIndex fi = archiveIndex.findFirst(FileIndex.[path], probe)
		
		if (fi == null)
			{
			throw new Exception("file '$path' not found in archive")
			return false
			}
		
		LFH lfh = readLFH(ifd, fi.cdr.headerOffset)
		
		int4 crc = 0
		
		StreamCompression scom = null
		
		if (lfh.compressionType == ZIP_CM_DEFLATE)
			{
			scom = new StreamCompression:deflate()
			}
			else if (lfh.compressionType == ZIP_CM_NONE)
			{
			
			}
			else
			{
			throw new Exception("unknown compression method $(lfh.compressionType) for '$path'")
			}
		
		//anything other than "stored" goes through the decompressor
		bool compressed = lfh.compressionType != ZIP_CM_NONE
		
		//skip over the file name and extra field, which sit between the LFH and the entry data
		char fn[] = ifd.read(lfh.fileNameLen)
		byte xfdata[] = ifd.read(lfh.exFieldLen)
		
		if (compressed)
			scom.decompressInit()
		
		//the LFH is an unreliable source for compressedSize and CRC info, since that data may be absent and included in a "data descriptor" instead (in which case we'd need to decompress the data until deflate told us it's finished)
		// - instead of having two different styles, however, we always read this data from the CDR (since we don't support multi-disk archives anyway at the moment)
		int remaining = fi.cdr.compressedSize
		while (remaining > 0)
			{
			//read a full chunk, or whatever is left when less than a chunk remains
			int thisRead = CHUNK_SIZE
			if (remaining < CHUNK_SIZE) thisRead = remaining
			
			byte cdata[] = ifd.read(thisRead)
			
			byte ddata[]
			
			if (compressed)
				ddata = scom.decompress(cdata)
				else
				ddata = cdata
			
			crc = crc32.makeCRC(crc, ddata)
			
			ofd.write(ddata)
			
			remaining -= thisRead
			}
		
		if (compressed)
			scom.decompressEnd()
		
		return true
		}
	
	}
Revision history
To propose a new revision to this entity, use dana source put -uc your/new/version.dn -n compress.Archive:zip -m "reason for update" -u yourUsername
Version 2 by barry
Version 1 (this version) by barry
Notes for this version: Standard Library Initialisation