aboutsummaryrefslogtreecommitdiffhomepage
path: root/org/in_zip_pod.org
diff options
context:
space:
mode:
authorRalph Amissah <ralph.amissah@gmail.com>2026-04-13 15:33:07 -0400
committerRalph Amissah <ralph.amissah@gmail.com>2026-04-13 16:25:56 -0400
commitd0ac448e6425c9e4246cd529aeb11643dce8093f (patch)
treec12356fbc55cffd495cc37b81ca6fb125e3be195 /org/in_zip_pod.org
parentpackage.nix cosmetic line-breaks for build command (diff)
spine may be run against a document-markup zip pod
- claude contributed src - Opens the zip with std.zip.ZipArchive (reads the whole file into memory) - Locates pod.manifest inside the archive to discover document paths and languages - Extracts markup files (.sst/.ssm/.ssi) as in-memory strings - Extracts images as in-memory byte arrays - Extracts conf/dr_document_make if present - Presents these to the existing pipeline as if they were read from the filesystem - Some security mitigations: - Zip Slip / Path Traversal: Reject entries containing `..` or starting with `/`; canonicalize resolved paths and verify they fall within extraction root - Zip Bomb: Check `ArchiveMember.size` before extracting; enforce per-file (50MB) and total size limits (500MB) - Entry Count: Limit number of entries (a pod should have at most ~100 files) - Path depth: limit (Maximum 10 path components). - Symlinks: Verify no symlinks in extracted content before processing (post-extraction recursive scan) - Filename Validation: Only allow expected characters; reject null bytes - Malformed Zips: Catch `ZipException` from `std.zip.ZipArchive` constructor - Cleanup on error
Diffstat (limited to 'org/in_zip_pod.org')
-rw-r--r--org/in_zip_pod.org283
1 files changed, 283 insertions, 0 deletions
diff --git a/org/in_zip_pod.org b/org/in_zip_pod.org
new file mode 100644
index 0000000..bc5b069
--- /dev/null
+++ b/org/in_zip_pod.org
@@ -0,0 +1,283 @@
+-*- mode: org -*-
+#+TITLE: sisudoc spine (doc_reform) markup source zip pod
+#+DESCRIPTION: documents - structuring, publishing in multiple formats & search
+#+FILETAGS: :spine:sourcefile:read:
+#+AUTHOR: Ralph Amissah
+#+EMAIL: [[mailto:ralph.amissah@gmail.com][ralph.amissah@gmail.com]]
+#+COPYRIGHT: Copyright (C) 2015 (continuously updated, current 2026) Ralph Amissah
+#+LANGUAGE: en
+#+STARTUP: content hideblocks hidestars noindent entitiespretty
+#+PROPERTY: header-args :exports code
+#+PROPERTY: header-args+ :noweb yes
+#+PROPERTY: header-args+ :results silent
+#+PROPERTY: header-args+ :cache no
+#+PROPERTY: header-args+ :padline no
+#+PROPERTY: header-args+ :mkdirp yes
+#+OPTIONS: H:3 num:nil toc:t \n:t ::t |:t ^:nil -:t f:t *:t
+
+- [[./doc-reform.org][doc-reform.org]] [[./][org/]]
+
+* read zip
+
+#+HEADER: :tangle "../src/sisudoc/io_in/read_zip_pod.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+ module read_zip_pod;<BR>
+ - extract pod zip archives to temp directory for processing<BR>
+ - validate zip entries for security (path traversal, size limits)
++/
+module sisudoc.io_in.read_zip_pod;
+@safe:
+template spineExtractZipPod() {
+ import std.algorithm : canFind;
+ import std.array : array;
+ import std.conv : to;
+ import std.file;
+ import std.path;
+ import std.regex;
+ import std.stdio;
+ import std.string : indexOf;
+
+ /+ security limits for zip extraction +/
+ enum size_t MAX_ENTRY_SIZE = 50 * 1024 * 1024; /+ 50 MB per entry (checked against each member's expandedSize) +/
+ enum size_t MAX_TOTAL_SIZE = 500 * 1024 * 1024; /+ 500 MB total (checked against the running sum of expandedSize) +/
+ enum size_t MAX_ENTRY_COUNT = 500; /+ max entries in archive +/
+ enum size_t MAX_PATH_DEPTH = 10; /+ max path components +/
+
+ /+ allowed entry name pattern: ASCII alphanumerics, dots, underscores, forward slashes, spaces and dashes; the whole name must match (anchored at both ends), so an empty name never matches +/
+ static auto rgx_safe_entry_name = ctRegex!(`^[a-zA-Z0-9._/ -]+$`);
+
+ /+ outcome of extractZipPod: the path fields are only filled in when extraction succeeded, error_msg only when it failed +/
+ struct ZipPodResult {
+ string tmp_dir; /+ temp directory containing extracted pod (the shared base directory under the system temp dir) +/
+ string pod_dir; /+ path to pod directory within tmp_dir (named after the zip file, extension stripped) +/
+ bool ok; /+ extraction succeeded; cleanupZipPod resets this to false +/
+ string error_msg; /+ error description if !ok +/
+ }
+
+ /++
+   Validate a single zip entry name before anything is written to disk.
+   The checks run in the order below and the first failure is reported.
+   Params:
+     name = entry name exactly as stored in the zip directory
+   Returns: an empty string when the name is acceptable, otherwise a
+     description of the first problem found
+ +/
+ string validateEntryName(string name) {
+   /+ reject empty names +/
+   if (name.length == 0)
+     return "empty entry name";
+   /+ reject absolute paths +/
+   if (name[0] == '/')
+     return "absolute path in zip entry: " ~ name;
+   /+ reject path traversal; deliberately strict: any ".." substring is refused, even inside a file name +/
+   if (name.canFind(".."))
+     return "path traversal in zip entry: " ~ name;
+   /+ reject null bytes +/
+   if (name.indexOf('\0') >= 0)
+     return "null byte in zip entry name: " ~ name;
+   /+ reject backslashes (windows path separator tricks) +/
+   if (name.canFind("\\"))
+     return "backslash in zip entry: " ~ name;
+   /+ check path depth: count path components (runs of non-'/' characters),
+      so that MAX_PATH_DEPTH really is the maximum number of components;
+      counting separators alone let one extra component through,
+      a trailing '/' on a directory entry does not add a component +/
+   size_t components = 0;
+   bool inside_component = false;
+   foreach (c; name) {
+     if (c == '/') {
+       inside_component = false;
+     } else if (!inside_component) {
+       inside_component = true;
+       components++;
+     }
+   }
+   if (components > MAX_PATH_DEPTH)
+     return "path too deep in zip entry: " ~ name;
+   /+ check allowed characters +/
+   if (!(name.matchFirst(rgx_safe_entry_name)))
+     return "disallowed characters in zip entry: " ~ name;
+   return ""; /+ empty string means valid +/
+ }
+
+ /++
+   Extract a zip pod into its own directory beneath the system temp directory.
+   Every entry is validated (name, declared size, total size, entry count)
+   before anything is written; extraction then re-checks containment and the
+   actual expanded size of each entry. Partially extracted content is removed
+   (best effort) when extraction fails part-way.
+   Params:
+     zip_path = path of the zip file; its base name without extension
+                becomes the name of the extracted pod directory
+   Returns: a ZipPodResult; on success ok is true and tmp_dir / pod_dir are
+     set, on failure ok is false and error_msg describes the problem
+ +/
+ @trusted ZipPodResult extractZipPod(string zip_path) {
+   import std.zip;
+   ZipPodResult result;
+   result.ok = false;
+   string pod_dir; /+ stays empty until the extraction directory has been chosen +/
+   /+ ↓ record the failure message; optionally discard partially extracted content (best effort) +/
+   ZipPodResult fail(string msg, bool discard_extracted = false) {
+     if (discard_extracted && pod_dir.length > 0) {
+       try { rmdirRecurse(pod_dir); } catch (FileException) {}
+     }
+     result.error_msg = msg;
+     return result;
+   }
+   /+ ↓ verify zip file exists +/
+   if (!exists(zip_path) || !zip_path.isFile)
+     return fail("zip file not found: " ~ zip_path);
+   /+ ↓ derive pod name from zip filename +/
+   string zip_basename = zip_path.baseName.stripExtension;
+   /+ a file called "...zip" or "..zip" strips down to ".." or ".", which would
+      make pod_dir resolve to the temp directory itself or to the shared base
+      directory, and that directory is removed recursively further down +/
+   if (zip_basename.length == 0 || zip_basename == "." || zip_basename == "..")
+     return fail("cannot derive pod name from zip filename: " ~ zip_path);
+   /+ ↓ read and parse zip archive +/
+   ZipArchive zip;
+   try {
+     zip = new ZipArchive(read(zip_path));
+   } catch (ZipException ex) {
+     return fail("failed to read zip archive: " ~ zip_path ~ " - " ~ ex.msg);
+   } catch (Exception ex) {
+     return fail("error reading zip file: " ~ zip_path ~ " - " ~ ex.msg);
+   }
+   /+ ↓ validate entry count +/
+   if (zip.directory.length > MAX_ENTRY_COUNT) {
+     return fail("zip archive has too many entries ("
+       ~ zip.directory.length.to!string ~ " > " ~ MAX_ENTRY_COUNT.to!string ~ "): " ~ zip_path);
+   }
+   /+ ↓ validate all entries before extracting any +/
+   size_t total_size = 0;
+   foreach (name, member; zip.directory) {
+     /+ validate entry name +/
+     string name_err = validateEntryName(name);
+     if (name_err.length > 0)
+       return fail(name_err);
+     /+ check per-entry size +/
+     if (member.expandedSize > MAX_ENTRY_SIZE) {
+       return fail("zip entry too large ("
+         ~ member.expandedSize.to!string ~ " bytes): " ~ name);
+     }
+     /+ check total size +/
+     total_size += member.expandedSize;
+     if (total_size > MAX_TOTAL_SIZE) {
+       return fail("zip archive total size exceeds limit ("
+         ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path);
+     }
+   }
+   /+ ↓ create temp directory +/
+   /+ NOTE(review): the base directory name is fixed and lives in the shared
+      temp directory; consider a per-user or randomised location +/
+   string tmp_base = tempDir.buildPath("spine-zip-pod");
+   try {
+     if (!exists(tmp_base))
+       mkdirRecurse(tmp_base);
+   } catch (FileException ex) {
+     return fail("failed to create temp base directory: " ~ ex.msg);
+   }
+   /+ pod directory inside temp: tmp_base/pod_name/ (any previous extraction of the same pod is replaced) +/
+   pod_dir = tmp_base.buildPath(zip_basename);
+   try {
+     if (exists(pod_dir))
+       rmdirRecurse(pod_dir);
+     mkdirRecurse(pod_dir);
+   } catch (FileException ex) {
+     return fail("failed to create temp pod directory: " ~ ex.msg);
+   }
+   /+ ↓ extract entries +/
+   /+ zip internal structure uses paths like:
+        pod.manifest, conf/dr_document_make,
+        pod/media/text/en/filename.sst, image/filename.png
+      but the extracted pod directory needs to look like a normal pod:
+        pod.manifest, conf/dr_document_make,
+        media/text/en/filename.sst, image/filename.png
+      The "pod/" prefix in zip entries for text files maps to the pod root.
+      NOTE(review): "pod/x" and "x" therefore map to the same output file and
+      the later one written wins; confirm whether that should be rejected.
+   +/
+   /+ ↓ pre-compute canonical pod path for containment checks;
+        asNormalizedPath emits the platform separator, so append the same one +/
+   auto canonical_pod = (pod_dir.asNormalizedPath).array.to!string ~ dirSeparator;
+   size_t extracted_total = 0; /+ bytes actually expanded so far +/
+   foreach (name, member; zip.directory) {
+     /+ skip directory entries +/
+     if (name.length > 0 && name[$-1] == '/')
+       continue;
+     /+ ↓ map zip internal path to filesystem path +/
+     /+ entries with "pod/" prefix: strip it so media/text/en/file.sst ends up at pod_dir/media/text/en/file.sst +/
+     string entry_path = name;
+     if (entry_path.length > 4 && entry_path[0..4] == "pod/") {
+       entry_path = entry_path[4..$];
+     }
+     string out_path = pod_dir.buildPath(entry_path);
+     /+ ↓ verify resolved path is within pod_dir (defense in depth) +/
+     auto canonical_out = (out_path.asNormalizedPath).array.to!string;
+     if (canonical_out.length < canonical_pod.length
+       || canonical_out[0..canonical_pod.length] != canonical_pod)
+     {
+       return fail("zip entry escapes extraction directory: " ~ name, true);
+     }
+     /+ ↓ create parent directories +/
+     string parent = out_path.dirName;
+     try {
+       if (!exists(parent))
+         mkdirRecurse(parent);
+     } catch (FileException ex) {
+       return fail("failed to create directory for: " ~ name ~ " - " ~ ex.msg, true);
+     }
+     /+ ↓ decompress and write file +/
+     try {
+       auto data = zip.expand(member);
+       /+ the sizes validated above are the ones the archive declares about
+          itself; enforce the same limits on what was actually expanded
+          before it reaches the disk +/
+       if (data.length > MAX_ENTRY_SIZE) {
+         return fail("zip entry too large ("
+           ~ data.length.to!string ~ " bytes): " ~ name, true);
+       }
+       extracted_total += data.length;
+       if (extracted_total > MAX_TOTAL_SIZE) {
+         return fail("zip archive total size exceeds limit ("
+           ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path, true);
+       }
+       std.file.write(out_path, data);
+     } catch (Exception ex) {
+       return fail("failed to extract: " ~ name ~ " - " ~ ex.msg, true);
+     }
+   }
+   /+ ↓ verify no symlinks were created (defense in depth) +/
+   string symlink_err = checkForSymlinks(pod_dir);
+   if (symlink_err.length > 0)
+     return fail(symlink_err, true);
+   /+ ↓ verify pod.manifest exists in extracted content +/
+   if (!exists(pod_dir.buildPath("pod.manifest")))
+     return fail("zip archive does not contain pod.manifest: " ~ zip_path, true);
+   result.tmp_dir = tmp_base;
+   result.pod_dir = pod_dir;
+   result.ok = true;
+   return result;
+ }
+
+ /++
+   Recursively scan an extracted directory tree for symbolic links.
+   Params:
+     dir_path = root of the tree to scan
+   Returns: an empty string when no symlink was found, otherwise a message
+     naming the first symlink encountered, or describing the filesystem
+     error that interrupted the scan
+ +/
+ @trusted string checkForSymlinks(string dir_path) {
+   try {
+     /+ followSymlink is false so the scan stays inside dir_path: with the
+        default (true) a symlinked directory would be descended into and
+        content outside the extraction tree would be walked +/
+     foreach (entry; dirEntries(dir_path, SpanMode.depth, false)) {
+       if (entry.isSymlink) {
+         return "symlink found in zip extraction: " ~ entry.name;
+       }
+     }
+   } catch (FileException ex) {
+     return "error checking for symlinks: " ~ ex.msg;
+   }
+   return "";
+ }
+
+ /++
+   Remove the extracted pod directory recorded in a ZipPodResult.
+   Removal is best effort: a failure is reported on stderr as a warning and
+   not propagated. The result is marked as no longer usable (ok = false)
+   whether or not there was anything to remove.
+   Params:
+     zpr = result of an earlier extraction; only pod_dir is read
+ +/
+ void cleanupZipPod(ref ZipPodResult zpr) {
+   immutable target = zpr.pod_dir;
+   immutable bool present = target.length != 0 && exists(target);
+   if (present) {
+     try
+       rmdirRecurse(target);
+     catch (FileException)
+       stderr.writeln("WARNING: failed to clean up temp zip extraction: ", target);
+   }
+   zpr.ok = false;
+ }
+}
+#+END_SRC
+
+* org includes
+** project version
+
+#+NAME: spine_version
+#+HEADER: :noweb yes
+#+BEGIN_SRC emacs-lisp
+<<./sisudoc_spine_version_info_and_doc_header_including_copyright_and_license.org:spine_project_version()>>
+#+END_SRC
+
+** year
+
+#+NAME: year
+#+HEADER: :noweb yes
+#+BEGIN_SRC emacs-lisp
+<<./sisudoc_spine_version_info_and_doc_header_including_copyright_and_license.org:year()>>
+#+END_SRC
+
+** document header including copyright & license
+
+#+NAME: doc_header_including_copyright_and_license
+#+HEADER: :noweb yes
+#+BEGIN_SRC emacs-lisp
+<<./sisudoc_spine_version_info_and_doc_header_including_copyright_and_license.org:spine_doc_header_including_copyright_and_license()>>
+#+END_SRC
+
+* __END__