diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/sisudoc/io_in/read_zip_pod.d | 279 | ||||
| -rwxr-xr-x | src/sisudoc/spine.d | 180 |
2 files changed, 457 insertions, 2 deletions
diff --git a/src/sisudoc/io_in/read_zip_pod.d b/src/sisudoc/io_in/read_zip_pod.d new file mode 100644 index 0000000..38480cd --- /dev/null +++ b/src/sisudoc/io_in/read_zip_pod.d @@ -0,0 +1,279 @@ +/+ +- Name: SisuDoc Spine, Doc Reform [a part of] + - Description: documents, structuring, processing, publishing, search + - static content generator + + - Author: Ralph Amissah + [ralph.amissah@gmail.com] + + - Copyright: (C) 2015 (continuously updated, current 2026) Ralph Amissah, All Rights Reserved. + + - License: AGPL 3 or later: + + Spine (SiSU), a framework for document structuring, publishing and + search + + Copyright (C) Ralph Amissah + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Affero General Public License as published by the + Free Software Foundation, either version 3 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + You should have received a copy of the GNU General Public License along with + this program. If not, see [https://www.gnu.org/licenses/]. 
+ + If you have Internet connection, the latest version of the AGPL should be + available at these locations: + [https://www.fsf.org/licensing/licenses/agpl.html] + [https://www.gnu.org/licenses/agpl.html] + + - Spine (by Doc Reform, related to SiSU) uses standard: + - docReform markup syntax + - standard SiSU markup syntax with modified headers and minor modifications + - docReform object numbering + - standard SiSU object citation numbering & system + + - Homepages: + [https://www.sisudoc.org] + [https://www.doc-reform.org] + + - Git + [https://git.sisudoc.org/] + ++/
+/++
+  module read_zip_pod;<BR>
+  - extract pod zip archives to temp directory for processing<BR>
+  - validate zip entries for security (path traversal, size limits)
++/
+module sisudoc.io_in.read_zip_pod;
+@safe:
+template spineExtractZipPod() {
+  import std.algorithm : canFind;
+  import std.array : array;
+  import std.conv : to;
+  import std.file;
+  import std.path;
+  import std.regex;
+  import std.stdio;
+  import std.string : indexOf;
+
+  /+ security limits for zip extraction +/
+  enum size_t MAX_ENTRY_SIZE  = 50 * 1024 * 1024;  /+ 50 MB per entry +/
+  enum size_t MAX_TOTAL_SIZE  = 500 * 1024 * 1024; /+ 500 MB total +/
+  enum size_t MAX_ENTRY_COUNT = 500;               /+ max entries in archive +/
+  enum size_t MAX_PATH_DEPTH  = 10;                /+ max '/' separators in an entry name +/
+
+  /+ allowed entry name pattern: alphanumeric, dots, dashes, underscores, spaces, forward slashes +/
+  static auto rgx_safe_entry_name = ctRegex!(`^[a-zA-Z0-9._/ -]+$`);
+
+  /+ outcome of extractZipPod; on failure only ok and error_msg are meaningful +/
+  struct ZipPodResult {
+    string tmp_dir;   /+ temp directory containing extracted pod +/
+    string pod_dir;   /+ path to pod directory within tmp_dir +/
+    bool   ok;        /+ extraction succeeded +/
+    string error_msg; /+ error description if !ok +/
+  }
+
+  /+ ↓ validate a single zip entry name for security
+     - returns "" when the name is acceptable, otherwise a description of
+       the first rule it violates
+     - the ".." test is a substring test, so a name such as "a..b" is
+       rejected as well as a real "../" component (deliberately strict)
+  +/
+  string validateEntryName(string name) {
+    /+ reject empty names +/
+    if (name.length == 0)
+      return "empty entry name";
+    /+ reject absolute paths +/
+    if (name[0] == '/')
+      return "absolute path in zip entry: " ~ name;
+    /+ reject path traversal +/
+    if (name.canFind(".."))
+      return "path traversal in zip entry: " ~ name;
+    /+ reject null bytes +/
+    if (name.indexOf('\0') >= 0)
+      return "null byte in zip entry name: " ~ name;
+    /+ reject backslashes (windows path separator tricks) +/
+    if (name.canFind("\\"))
+      return "backslash in zip entry: " ~ name;
+    /+ check path depth (number of '/' separators) +/
+    size_t depth = 0;
+    foreach (c; name) {
+      if (c == '/') depth++;
+    }
+    if (depth > MAX_PATH_DEPTH)
+      return "path too deep in zip entry: " ~ name;
+    /+ check allowed characters +/
+    if (!(name.matchFirst(rgx_safe_entry_name)))
+      return "disallowed characters in zip entry: " ~ name;
+    return ""; /+ empty string means valid +/
+  }
+
+  /+ ↓ extract zip pod to temp directory, returns ZipPodResult
+     - zip_path: path to a pod zip archive on disk
+     - on success: result.ok is true, result.tmp_dir is
+       tempDir/spine-zip-pod and result.pod_dir is
+       tempDir/spine-zip-pod/<zip file name without extension>
+     - on failure: result.ok is false, result.error_msg describes the
+       problem and any partially extracted pod_dir has been removed
+     - the whole archive is read into memory, and every entry is validated
+       before anything is written to disk
+     - NOTE(review): the temp base is a fixed, predictable name under the
+       system temp directory; on multi-user hosts a per-run private
+       directory would be safer - confirm whether that matters here
+  +/
+  @trusted ZipPodResult extractZipPod(string zip_path) {
+    import std.zip;
+    ZipPodResult result;
+    result.ok = false;
+    /+ ↓ verify zip file exists +/
+    if (!exists(zip_path) || !zip_path.isFile) {
+      result.error_msg = "zip file not found: " ~ zip_path;
+      return result;
+    }
+    /+ ↓ derive pod name from zip filename +/
+    string zip_basename = zip_path.baseName.stripExtension;
+    /+ ↓ a file named "..zip" or "...zip" strips to "." or "..", which would
+       make pod_dir alias the temp base or its parent and hand that
+       directory to rmdirRecurse below; refuse such names outright +/
+    if (zip_basename.length == 0 || zip_basename == "." || zip_basename == "..") {
+      result.error_msg = "invalid pod name derived from zip filename: " ~ zip_path;
+      return result;
+    }
+    /+ ↓ read and parse zip archive +/
+    ZipArchive zip;
+    try {
+      zip = new ZipArchive(read(zip_path));
+    } catch (ZipException ex) {
+      result.error_msg = "failed to read zip archive: " ~ zip_path ~ " - " ~ ex.msg;
+      return result;
+    } catch (Exception ex) {
+      result.error_msg = "error reading zip file: " ~ zip_path ~ " - " ~ ex.msg;
+      return result;
+    }
+    /+ ↓ validate entry count +/
+    if (zip.directory.length > MAX_ENTRY_COUNT) {
+      result.error_msg = "zip archive has too many entries ("
+        ~ zip.directory.length.to!string ~ " > " ~ MAX_ENTRY_COUNT.to!string ~ "): " ~ zip_path;
+      return result;
+    }
+    /+ ↓ validate all entries before extracting any
+       (sizes here are the values declared in the archive headers) +/
+    size_t total_size = 0;
+    foreach (name, member; zip.directory) {
+      /+ validate entry name +/
+      string name_err = validateEntryName(name);
+      if (name_err.length > 0) {
+        result.error_msg = name_err;
+        return result;
+      }
+      /+ check per-entry size +/
+      if (member.expandedSize > MAX_ENTRY_SIZE) {
+        result.error_msg = "zip entry too large ("
+          ~ member.expandedSize.to!string ~ " bytes): " ~ name;
+        return result;
+      }
+      /+ check total size +/
+      total_size += member.expandedSize;
+      if (total_size > MAX_TOTAL_SIZE) {
+        result.error_msg = "zip archive total size exceeds limit ("
+          ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path;
+        return result;
+      }
+    }
+    /+ ↓ create temp directory +/
+    string tmp_base = tempDir.buildPath("spine-zip-pod");
+    try {
+      if (!exists(tmp_base))
+        mkdirRecurse(tmp_base);
+    } catch (FileException ex) {
+      result.error_msg = "failed to create temp base directory: " ~ ex.msg;
+      return result;
+    }
+    /+ pod directory inside temp: tmp_base/pod_name/ +/
+    string pod_dir = tmp_base.buildPath(zip_basename);
+    try {
+      if (exists(pod_dir))
+        rmdirRecurse(pod_dir);
+      mkdirRecurse(pod_dir);
+    } catch (FileException ex) {
+      result.error_msg = "failed to create temp pod directory: " ~ ex.msg;
+      return result;
+    }
+    /+ ↓ from here on pod_dir exists: remove it again on every exit path
+       that has not set result.ok (best effort, removal errors ignored) +/
+    scope (exit) {
+      if (!result.ok) {
+        try { rmdirRecurse(pod_dir); } catch (FileException) {}
+      }
+    }
+    /+ ↓ extract entries +/
+    /+ zip internal structure uses paths like:
+         pod.manifest, conf/dr_document_make,
+         pod/media/text/en/filename.sst, image/filename.png
+       but the extracted pod directory needs to look like a normal pod:
+         pod.manifest, conf/dr_document_make,
+         media/text/en/filename.sst, image/filename.png
+       The "pod/" prefix in zip entries for text files maps to the pod root.
+    +/
+    /+ ↓ pre-compute canonical pod path for containment checks; use the
+       platform separator so the prefix matches asNormalizedPath output +/
+    auto canonical_pod = (pod_dir.asNormalizedPath).array.to!string ~ dirSeparator;
+    size_t expanded_total = 0; /+ bytes actually produced by decompression +/
+    foreach (name, member; zip.directory) {
+      /+ skip directory entries +/
+      if (name.length > 0 && name[$-1] == '/')
+        continue;
+      /+ ↓ map zip internal path to filesystem path +/
+      /+ entries with "pod/" prefix: strip it so media/text/en/file.sst ends up at pod_dir/media/text/en/file.sst +/
+      string entry_path = name;
+      if (entry_path.length > 4 && entry_path[0..4] == "pod/") {
+        entry_path = entry_path[4..$];
+      }
+      string out_path = pod_dir.buildPath(entry_path);
+      /+ ↓ verify resolved path is within pod_dir (defense in depth);
+         this also catches "pod//abs/path", which is absolute once the
+         prefix is stripped +/
+      auto canonical_out = (out_path.asNormalizedPath).array.to!string;
+      if (canonical_out.length < canonical_pod.length
+        || canonical_out[0..canonical_pod.length] != canonical_pod)
+      {
+        result.error_msg = "zip entry escapes extraction directory: " ~ name;
+        return result;
+      }
+      /+ ↓ pod_dir started empty, so an existing target means two entries
+         (e.g. "x" and "pod/x") map to the same path; which one would win
+         depends on iteration order, so refuse instead of overwriting +/
+      if (exists(out_path)) {
+        result.error_msg = "duplicate output path in zip archive: " ~ name;
+        return result;
+      }
+      /+ ↓ create parent directories +/
+      string parent = out_path.dirName;
+      try {
+        if (!exists(parent))
+          mkdirRecurse(parent);
+      } catch (FileException ex) {
+        result.error_msg = "failed to create directory for: " ~ name ~ " - " ~ ex.msg;
+        return result;
+      }
+      /+ ↓ decompress and write file +/
+      try {
+        auto data = zip.expand(member);
+        /+ ↓ the sizes validated earlier are header values chosen by whoever
+           built the archive; re-check against what decompression actually
+           produced before writing. NOTE: the entry is already fully
+           expanded in memory at this point, so this bounds what reaches
+           disk, not peak memory +/
+        if (data.length > MAX_ENTRY_SIZE) {
+          result.error_msg = "zip entry too large ("
+            ~ data.length.to!string ~ " bytes): " ~ name;
+          return result;
+        }
+        expanded_total += data.length;
+        if (expanded_total > MAX_TOTAL_SIZE) {
+          result.error_msg = "zip archive total size exceeds limit ("
+            ~ MAX_TOTAL_SIZE.to!string ~ " bytes): " ~ zip_path;
+          return result;
+        }
+        std.file.write(out_path, data);
+      } catch (Exception ex) {
+        result.error_msg = "failed to extract: " ~ name ~ " - " ~ ex.msg;
+        return result;
+      }
+    }
+    /+ ↓ verify no symlinks were created (defense in depth) +/
+    string symlink_err = checkForSymlinks(pod_dir);
+    if (symlink_err.length > 0) {
+      result.error_msg = symlink_err;
+      return result;
+    }
+    /+ ↓ verify pod.manifest exists in extracted content +/
+    if (!exists(pod_dir.buildPath("pod.manifest"))) {
+      result.error_msg = "zip archive does not contain pod.manifest: " ~ zip_path;
+      return result;
+    }
+    result.tmp_dir = tmp_base;
+    result.pod_dir = pod_dir;
+    result.ok = true;
+    return result;
+  }
+
+  /+ ↓ recursively check for symlinks in extracted directory
+     - returns "" when none is found, otherwise a message naming the first
+       symlink (or the error that stopped the walk)
+     - the walk does not follow symlinks, so it never leaves dir_path
+  +/
+  @trusted string checkForSymlinks(string dir_path) {
+    try {
+      foreach (entry; dirEntries(dir_path, SpanMode.depth, false)) {
+        if (entry.isSymlink) {
+          return "symlink found in zip extraction: " ~ entry.name;
+        }
+      }
+    } catch (FileException ex) {
+      return "error checking for symlinks: " ~ ex.msg;
+    }
+    return "";
+  }
+
+  /+ ↓ clean up extracted temp directory
+     - removes zpr.pod_dir if it is set and exists, warning on stderr if
+       removal fails
+     - always marks zpr as no longer usable (ok = false)
+  +/
+  void cleanupZipPod(ref ZipPodResult zpr) {
+    if (zpr.pod_dir.length > 0 && exists(zpr.pod_dir)) {
+      try {
+        rmdirRecurse(zpr.pod_dir);
+      } catch (FileException ex) {
+        stderr.writeln("WARNING: failed to clean up temp zip extraction: ", zpr.pod_dir, " - ", ex.msg);
+      }
+    }
+    zpr.ok = false;
+  }
+}
diff --git a/src/sisudoc/spine.d b/src/sisudoc/spine.d index 5d3b228..ee3bcef 100755 --- a/src/sisudoc/spine.d +++ b/src/sisudoc/spine.d @@ -77,6 +77,7 @@ import sisudoc.meta.rgx_files; import sisudoc.io_in.paths_source; import sisudoc.io_in.read_config_files; import sisudoc.io_in.read_source_files; +import sisudoc.io_in.read_zip_pod; import sisudoc.io_out.hub; mixin(import("version.txt")); mixin(import("configuration.txt")); @@ -856,6 +857,9 @@ string program_name = "spine"; auto _manifested = PathMatters!()(_opt_action, _env, ""); auto _manifests = [ _manifested ]; auto _conf_file_details = configFilePaths!()(_manifested, _env, _opt_action.config_path_set); + /+ ↓ track extracted zip pod temp directories for cleanup +/ + mixin spineExtractZipPod; + ZipPodResult[] _zip_pod_extractions; ConfComposite _siteConfig; if ( _opt_action.require_processing_files @@ -863,7 +867,16 @@ string program_name = "spine"; ) { foreach(arg; args[1..$]) { if (!(arg.match(rgx.flag_action))) { /+ cli markup source path +/ // get first 
input markup source file names for processing - _manifested = PathMatters!()(_opt_action, _env, arg); + string _config_arg = arg; + /+ ↓ if first non-flag arg is a zip, extract for config discovery +/ + if (arg.match(rgx_files.src_pth_zip)) { + auto _zpr = extractZipPod(arg); + if (_zpr.ok) { + _zip_pod_extractions ~= _zpr; + _config_arg = _zpr.pod_dir; + } + } + _manifested = PathMatters!()(_opt_action, _env, _config_arg); { /+ local site config +/ _conf_file_details = configFilePaths!()(_manifested, _env, _opt_action.config_path_set); auto _config_local_site_struct = readConfigSite!()(_conf_file_details, _opt_action, _cfg); @@ -1047,7 +1060,166 @@ string program_name = "spine"; _manifests ~= _manifested; } } else if (arg.match(rgx_files.src_pth_zip)) { - // fns_src ~= arg; // gather input markup source file names for processing + /+ ↓ zip pod archive: extract to temp dir, process as pod +/ + /+ check if this zip was already extracted during config discovery +/ + string _zip_pod_dir; + foreach (ref _zpr; _zip_pod_extractions) { + if (_zpr.ok && _zpr.pod_dir.length > 0 + && _zpr.pod_dir.baseName == arg.baseName.stripExtension) + { + _zip_pod_dir = _zpr.pod_dir; + break; + } + } + if (_zip_pod_dir.length == 0) { + auto _zpr = extractZipPod(arg); + if (!_zpr.ok) { + writeln("ERROR >> Processing Skipped! 
Zip extraction failed: ", arg, " - ", _zpr.error_msg); + } else { + _zip_pod_extractions ~= _zpr; + _zip_pod_dir = _zpr.pod_dir; + } + } + if (_zip_pod_dir.length > 0) { + /+ process extracted pod directory same as regular pod +/ + auto _zip_manifest = PodManifest!()(_opt_action, _zip_pod_dir); + if (_zip_manifest.pod_manifest_file_with_path + && _opt_action.abstraction + ) { + string pod_manifest_root_content_paths_to_markup_location_raw_; + string markup_contents_location_; + string sisudoc_txt_ = _zip_manifest.pod_manifest_file_with_path; + enforce( + exists(sisudoc_txt_)!=0, + "file not found: <<" ~ + sisudoc_txt_ ~ ">>" + ); + if (exists(sisudoc_txt_)) { + try { + import dyaml; + Node pod_manifest_yaml; + try { + pod_manifest_yaml = Loader.fromFile(sisudoc_txt_).load(); + } catch (ErrnoException ex) { + } catch (FileException ex) { + writeln("ERROR failed to read config file"); + } catch (Throwable) { + writeln("ERROR failed to read config file content, not parsed as yaml"); + } + if ("doc" in pod_manifest_yaml) { + if (pod_manifest_yaml["doc"].type.mapping + && pod_manifest_yaml["doc"].tag.match(rgx_y.yaml_tag_is_map) + ) { + if ("path" in pod_manifest_yaml["doc"]) { + if (pod_manifest_yaml["doc"]["path"].tag.match(rgx_y.yaml_tag_is_seq)) { + foreach (string _path; pod_manifest_yaml["doc"]["path"]) { + markup_contents_location_ ~= _path ~ "\n"; + pod_manifest_root_content_paths_to_markup_location_raw_ ~= + _path ~ "\n"; + } + } else if ( + pod_manifest_yaml["doc"]["path"].type.string + && pod_manifest_yaml["doc"]["path"].tag.match(rgx_y.yaml_tag_is_str) + ) { + markup_contents_location_ = pod_manifest_yaml["doc"]["path"].get!string; + pod_manifest_root_content_paths_to_markup_location_raw_ = + pod_manifest_yaml["doc"]["path"].get!string; + } + } + if ("filename" in pod_manifest_yaml["doc"]) { + if (pod_manifest_yaml["doc"]["filename"].tag.match(rgx_y.yaml_tag_is_seq)) { + foreach (string _filename; pod_manifest_yaml["doc"]["filename"]) { + if ("language" in 
pod_manifest_yaml["doc"]) { + if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_seq)) { + foreach (string _lang; pod_manifest_yaml["doc"]["language"]) { + markup_contents_location_ ~= + "media/text/" + ~ _lang ~ "/" + ~ _filename ~ "\n"; + } + } else if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_str) + ) { + markup_contents_location_ = + "media/text/" + ~ pod_manifest_yaml["doc"]["language"].get!string + ~ "/" ~ _filename ~ "\n"; + } else { + string _lang_default = "en"; + markup_contents_location_ ~= + "media/text/" + ~ _lang_default ~ "/" + ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n"; + } + } else { + string _lang_default = "en"; + markup_contents_location_ ~= + "media/text/" + ~ _lang_default ~ "/" + ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n"; + } + } + } else if ( + pod_manifest_yaml["doc"]["filename"].type.string + && pod_manifest_yaml["doc"]["filename"].tag.match(rgx_y.yaml_tag_is_str) + ) { + if ("language" in pod_manifest_yaml["doc"]) { + if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_seq)) { + foreach (string _lang; pod_manifest_yaml["doc"]["language"]) { + markup_contents_location_ ~= + "media/text/" + ~ _lang ~ "/" + ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n"; + } + } else if (pod_manifest_yaml["doc"]["language"].tag.match(rgx_y.yaml_tag_is_str)) { + markup_contents_location_ = + "media/text/" + ~ pod_manifest_yaml["doc"]["language"].get!string + ~ "/" ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n"; + } else { + string _lang_default = "en"; + markup_contents_location_ ~= + "media/text/" + ~ _lang_default ~ "/" + ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n"; + } + } else { + string _lang_default = "en"; + markup_contents_location_ ~= + "media/text/" + ~ _lang_default ~ "/" + ~ pod_manifest_yaml["doc"]["filename"].get!string ~ "\n"; + } + } + } + } + } + } catch (ErrnoException ex) { + } catch (FileException ex) { + // Handle 
errors + } + } else { + writeln("manifest not found: ", sisudoc_txt_); + } + auto markup_contents_locations_arr + = (cast(char[]) markup_contents_location_).split; + auto tmp_dir_ = (sisudoc_txt_).dirName.array; + foreach (markup_contents_location; markup_contents_locations_arr) { + assert(markup_contents_location.match(rgx_files.src_pth_sst_or_ssm), + "not a recognised file: <<" ~ + markup_contents_location ~ ">>" + ); + auto markup_contents_location_pth_ = (markup_contents_location).to!string; + Regex!(char) lang_rgx_ = regex(r"/(" ~ _opt_action.languages_set.join("|") ~ ")/"); + if (_opt_action.languages_set[0] == "all" + || (markup_contents_location_pth_).match(lang_rgx_) + ) { + auto _fns = (((tmp_dir_).chainPath(markup_contents_location_pth_)).array).to!string; + _manifested = PathMatters!()(_opt_action, _env, _zip_pod_dir, _fns, markup_contents_locations_arr); + _manifests ~= _manifested; + } + } + } + } } else { // anything remaining, unused arg_unrecognized ~= " " ~ arg; } @@ -1277,4 +1449,8 @@ string program_name = "spine"; } } } // else { writeln("NO METADATA CURATED"); } + /+ ↓ clean up any extracted zip pod temp directories +/ + foreach (ref _zpr; _zip_pod_extractions) { + cleanupZipPod(_zpr); + } } |
