From e973365c4b74be2b2cff9be970ccba5928dbe368 Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Wed, 22 May 2019 10:50:33 -0400 Subject: 0.7.3 start to look at document harvest (initial stub) --- src/doc_reform/meta/conf_make_meta_json.d | 215 ++++++++++++++++----------- src/doc_reform/meta/conf_make_meta_structs.d | 3 + src/doc_reform/meta/metadoc.d | 1 + src/doc_reform/meta/metadoc_harvest.d | 30 ++++ src/doc_reform/meta/metadoc_summary.d | 4 +- src/doc_reform/meta/metadochead.d | 84 ----------- src/doc_reform/meta/rgx.d | 7 +- 7 files changed, 170 insertions(+), 174 deletions(-) create mode 100644 src/doc_reform/meta/metadoc_harvest.d delete mode 100644 src/doc_reform/meta/metadochead.d (limited to 'src/doc_reform/meta') diff --git a/src/doc_reform/meta/conf_make_meta_json.d b/src/doc_reform/meta/conf_make_meta_json.d index fcd52c1..5fd4499 100644 --- a/src/doc_reform/meta/conf_make_meta_json.d +++ b/src/doc_reform/meta/conf_make_meta_json.d @@ -5,6 +5,8 @@ module doc_reform.meta.conf_make_meta_json; static template contentJSONtoDocReformStruct() { import + std.algorithm, + std.array, std.exception, std.regex, std.stdio, @@ -16,6 +18,7 @@ static template contentJSONtoDocReformStruct() { import doc_reform.meta.conf_make_meta_structs, doc_reform.meta.conf_make_meta_json, + doc_reform.meta.defaults, doc_reform.meta.rgx; ConfCompositePlus _struct_composite; auto contentJSONtoDocReformStruct(C,J)(C _struct_composite, J _json, string _identifier) { @@ -361,6 +364,108 @@ static template contentJSONtoDocReformStruct() { } } /+ meta ------------------------------------------------------------------- +/ + if (_struct_composite.meta.creator_author.empty) { + if ("creator" in _json.object) { + if ("author" in _json.object["creator"] + && (_json.object["creator"]["author"].type().to!string == "string") + ) { + _struct_composite.meta.creator_author = _json.object["creator"]["author"].str; + } + if ("email" in _json.object["creator"] + && 
(_json.object["creator"]["email"].type().to!string == "string") + ) { + _struct_composite.meta.creator_author_email = _json.object["creator"]["email"].str; + } + if ("illustrator" in _json.object["creator"] + && (_json.object["creator"]["illustrator"].type().to!string == "string") + ) { + _struct_composite.meta.creator_illustrator = _json.object["creator"]["illustrator"].str; + } + if ("translator" in _json.object["creator"] + && (_json.object["creator"]["translator"].type().to!string == "string") + ) { + _struct_composite.meta.creator_translator = _json.object["creator"]["translator"].str; + } + } + string[] authors_arr; + string[][string] authors_hash_arr = [ "first" : [], "last" : [], "full" : [], "last_first" : [], "as_input" : [] ]; + string[] authors_raw_arr + = _struct_composite.meta.creator_author.split(rgx.arr_delimiter); + auto _lastname = appender!(char[])(); + foreach (author_raw; authors_raw_arr) { + authors_arr ~= author_raw.replace(rgx.raw_author_munge, "$2 $1"); + authors_hash_arr["first"] ~= author_raw.replace(rgx.raw_author_munge, "$2"); + authors_hash_arr["last"] ~= author_raw.replace(rgx.raw_author_munge, "$1"); + authors_hash_arr["full"] ~= author_raw.replace(rgx.raw_author_munge, "$2 $1"); + authors_hash_arr["as_input"] ~= author_raw; + if (auto m = author_raw.match(rgx.raw_author_munge)) { + (m.captures[1]).map!toUpper.copy(_lastname); + authors_hash_arr["last_first"] ~= _lastname.data.to!string ~ ", " ~ m.captures[2]; + _lastname = appender!(char[])(); + } + } + _struct_composite.meta.creator_author = authors_arr.join(", ").chomp.chomp; + string _author_name_last_first = authors_hash_arr["last_first"].join("; ").chomp.chomp; + _struct_composite.meta.creator_author_surname_fn = (_author_name_last_first.length > 0) + ? 
_author_name_last_first + : authors_hash_arr["as_input"].join("; ").chomp.chomp; + } + if (_struct_composite.meta.title_main.empty) { + if ("title" in _json.object) { + if ((_json.object["title"].type().to!string) == "string") { + _struct_composite.meta.title_main = _json.object["title"].str; + } else { + if ("edition" in _json.object["title"] + && (_json.object["title"]["edition"].type().to!string == "string") + ) { + _struct_composite.meta.title_edition = _json.object["title"]["edition"].str; + } + if ("full" in _json.object["title"] + && (_json.object["title"]["full"].type().to!string == "string") + ) {} + if ("language" in _json.object["title"] + && (_json.object["title"]["language"].type().to!string == "string") + ) { + _struct_composite.meta.title_language = _json.object["title"]["language"].str; + } + if ("main" in _json.object["title"] + && (_json.object["title"]["main"].type().to!string == "string") + ) { + _struct_composite.meta.title_main = _json.object["title"]["main"].str; + } else if ("title" in _json.object["title"] + && (_json.object["title"]["title"].type().to!string == "string") + ) { + _struct_composite.meta.title_main = _json.object["title"]["title"].str; + } + if ("note" in _json.object["title"] + && (_json.object["title"]["note"].type().to!string == "string") + ) { + _struct_composite.meta.title_note = _json.object["title"]["note"].str; + } + if ("sub" in _json.object["title"] + && (_json.object["title"]["sub"].type().to!string == "string") + ) { + _struct_composite.meta.title_sub = _json.object["title"]["sub"].str; + } + if ("subtitle" in _json.object["title"] + && (_json.object["title"]["subtitle"].type().to!string == "string") + ) { + _struct_composite.meta.title_subtitle = _json.object["title"]["subtitle"].str; + } + } + } + if ((!(_struct_composite.meta.title_subtitle.empty)) + && (_struct_composite.meta.title_sub.empty)) { + _struct_composite.meta.title_sub = _struct_composite.meta.title_subtitle; + } + _struct_composite.meta.title_full 
= (_struct_composite.meta.title_sub.empty) + ? _struct_composite.meta.title_main + : format( + "%s - %s", + _struct_composite.meta.title_main, + _struct_composite.meta.title_sub, + ); + } if ("classify" in _json.object) { if ("dewey" in _json.object["classify"] && (_json.object["classify"]["dewey"].type().to!string == "string") @@ -386,6 +491,22 @@ static template contentJSONtoDocReformStruct() { && (_json.object["classify"]["topic_register"].type().to!string == "string") ) { _struct_composite.meta.classify_topic_register = _json.object["classify"]["topic_register"].str; + string[] main_topics_ = _struct_composite.meta.classify_topic_register.split(rgx.topic_register_main_terms_split); + string[] topics; + string topics_tmp; + string[] multiple_sub_terms; + foreach (mt; main_topics_) { + topics_tmp = mt.replaceAll(rgx.topic_register_main_term_plus_rest_split, mkup.sep); + if (auto m = topics_tmp.match(rgx.topic_register_multiple_sub_terms_split)) { + multiple_sub_terms = m.captures[1].split(rgx.topic_register_sub_terms_split); + foreach (subterm; multiple_sub_terms) { + topics ~= m.captures.pre ~ mkup.sep ~ subterm; + } + } else { + topics ~= topics_tmp; + } + } + _struct_composite.meta.classify_topic_register_arr = topics; } } if ("date" in _json.object) { @@ -424,6 +545,13 @@ static template contentJSONtoDocReformStruct() { ) { _struct_composite.meta.date_valid = _json.object["date"]["valid"].str; } + _struct_composite.meta.author_date_title = format( + "%s %s \"%s\"", + _struct_composite.meta.creator_author_surname_fn, + (_struct_composite.meta.date_published.length > 0) + ? 
"(" ~ _struct_composite.meta.date_published ~ ")" : "", + _struct_composite.meta.title_full, + ); } if ("links" in _json.object) {} if ("notes" in _json.object) { @@ -508,93 +636,6 @@ static template contentJSONtoDocReformStruct() { _struct_composite.meta.rights_license = _json.object["rights"]["license"].str; } } - if (_struct_composite.meta.creator_author.empty) { - if ("creator" in _json.object) { - if ("author" in _json.object["creator"] - && (_json.object["creator"]["author"].type().to!string == "string") - ) { - _struct_composite.meta.creator_author = _json.object["creator"]["author"].str; - } - if ("email" in _json.object["creator"] - && (_json.object["creator"]["email"].type().to!string == "string") - ) { - _struct_composite.meta.creator_author_email = _json.object["creator"]["email"].str; - } - if ("illustrator" in _json.object["creator"] - && (_json.object["creator"]["illustrator"].type().to!string == "string") - ) { - _struct_composite.meta.creator_illustrator = _json.object["creator"]["illustrator"].str; - } - if ("translator" in _json.object["creator"] - && (_json.object["creator"]["translator"].type().to!string == "string") - ) { - _struct_composite.meta.creator_translator = _json.object["creator"]["translator"].str; - } - } - string[] authors_arr; - string[] authors_raw_arr - = _struct_composite.meta.creator_author.split(rgx.arr_delimiter); - foreach (author_raw; authors_raw_arr) { - authors_arr ~= author_raw.replace(rgx.raw_author_munge, "$2 $1"); - } - _struct_composite.meta.creator_author = join(authors_arr, ", ").chomp.chomp; - } - if (_struct_composite.meta.title_main.empty) { - if ("title" in _json.object) { - if ((_json.object["title"].type().to!string) == "string") { - _struct_composite.meta.title_main = _json.object["title"].str; - } else { - if ("edition" in _json.object["title"] - && (_json.object["title"]["edition"].type().to!string == "string") - ) { - _struct_composite.meta.title_edition = _json.object["title"]["edition"].str; - } - if 
("full" in _json.object["title"] - && (_json.object["title"]["full"].type().to!string == "string") - ) {} - if ("language" in _json.object["title"] - && (_json.object["title"]["language"].type().to!string == "string") - ) { - _struct_composite.meta.title_language = _json.object["title"]["language"].str; - } - if ("main" in _json.object["title"] - && (_json.object["title"]["main"].type().to!string == "string") - ) { - _struct_composite.meta.title_main = _json.object["title"]["main"].str; - } else if ("title" in _json.object["title"] - && (_json.object["title"]["title"].type().to!string == "string") - ) { - _struct_composite.meta.title_main = _json.object["title"]["title"].str; - } - if ("note" in _json.object["title"] - && (_json.object["title"]["note"].type().to!string == "string") - ) { - _struct_composite.meta.title_note = _json.object["title"]["note"].str; - } - if ("sub" in _json.object["title"] - && (_json.object["title"]["sub"].type().to!string == "string") - ) { - _struct_composite.meta.title_sub = _json.object["title"]["sub"].str; - } - if ("subtitle" in _json.object["title"] - && (_json.object["title"]["subtitle"].type().to!string == "string") - ) { - _struct_composite.meta.title_subtitle = _json.object["title"]["subtitle"].str; - } - } - } - if ((!(_struct_composite.meta.title_subtitle.empty)) - && (_struct_composite.meta.title_sub.empty)) { - _struct_composite.meta.title_sub = _struct_composite.meta.title_subtitle; - } - _struct_composite.meta.title_full = (_struct_composite.meta.title_sub.empty) - ? 
_struct_composite.meta.title_main - : format( - "%s - %s", - _struct_composite.meta.title_main, - _struct_composite.meta.title_sub, - ); - } return _struct_composite; } } diff --git a/src/doc_reform/meta/conf_make_meta_structs.d b/src/doc_reform/meta/conf_make_meta_structs.d index 874e509..ff1ec76 100644 --- a/src/doc_reform/meta/conf_make_meta_structs.d +++ b/src/doc_reform/meta/conf_make_meta_structs.d @@ -181,7 +181,9 @@ struct MetaComposite { string classify_loc; string classify_subject; string classify_topic_register; + string[] classify_topic_register_arr; string creator_author; + string creator_author_surname_fn; string creator_author_email; string creator_illustrator; string creator_translator; @@ -223,6 +225,7 @@ struct MetaComposite { string title_short; string title_sub; string title_subtitle; + string author_date_title; } struct ConfComposite { MetaComposite meta; diff --git a/src/doc_reform/meta/metadoc.d b/src/doc_reform/meta/metadoc.d index d8cc19f..a4b920b 100644 --- a/src/doc_reform/meta/metadoc.d +++ b/src/doc_reform/meta/metadoc.d @@ -9,6 +9,7 @@ template DocReformAbstraction() { import doc_reform.meta, doc_reform.meta.metadoc_summary, + doc_reform.meta.metadoc_harvest, doc_reform.meta.metadoc_from_src, doc_reform.meta.conf_make_meta_structs, doc_reform.meta.conf_make_meta_toml, diff --git a/src/doc_reform/meta/metadoc_harvest.d b/src/doc_reform/meta/metadoc_harvest.d new file mode 100644 index 0000000..c3534f9 --- /dev/null +++ b/src/doc_reform/meta/metadoc_harvest.d @@ -0,0 +1,30 @@ +module doc_reform.meta.metadoc_harvest; +template DocReformMetaDocHarvest() { + auto DocReformMetaDocHarvest(T,H)( + T doc_matters, + H harvest, + ) { + import + doc_reform.meta.defaults, + doc_reform.meta.rgx; + import + std.array, + std.exception, + std.regex, + std.stdio, + std.string, + std.traits, + std.typecons, + std.uni, + std.utf, + std.conv : to; + mixin InternalMarkup; + auto markup = InlineMarkup(); + harvest.title = 
doc_matters.conf_make_meta.meta.title_full; + harvest.author = doc_matters.conf_make_meta.meta.creator_author; + harvest.author_date_title = doc_matters.conf_make_meta.meta.author_date_title; + harvest.date_published = doc_matters.conf_make_meta.meta.date_published; + harvest.topic_register_arr = doc_matters.conf_make_meta.meta.classify_topic_register_arr; + return harvest; + } +} diff --git a/src/doc_reform/meta/metadoc_summary.d b/src/doc_reform/meta/metadoc_summary.d index 768cebd..4beada8 100644 --- a/src/doc_reform/meta/metadoc_summary.d +++ b/src/doc_reform/meta/metadoc_summary.d @@ -1,6 +1,6 @@ module doc_reform.meta.metadoc_summary; -template DocReformAbstractionSummary() { - void DocReformAbstractionSummary(S,T)( +template DocReformMetaDocSummary() { + void DocReformMetaDocSummary(S,T)( const S doc_abstraction, T doc_matters, ) { diff --git a/src/doc_reform/meta/metadochead.d b/src/doc_reform/meta/metadochead.d deleted file mode 100644 index 05be0a8..0000000 --- a/src/doc_reform/meta/metadochead.d +++ /dev/null @@ -1,84 +0,0 @@ -module doc_reform.meta.metadochead; -template DocReformHarvestGetFromHead() { // TODO - import - std.datetime, - std.getopt, - std.file, - std.path, - std.process; - import - doc_reform.meta, - doc_reform.meta.metadoc_summary, - doc_reform.meta.metadoc_from_src, - doc_reform.meta.conf_make_meta_structs, - doc_reform.meta.conf_make_meta_toml, - doc_reform.meta.conf_make_meta_json, - doc_reform.meta.defaults, - doc_reform.meta.doc_debugs, - doc_reform.meta.rgx, - doc_reform.source.paths_source, - doc_reform.source.read_config_files, - doc_reform.source.read_source_files, - doc_reform.output.hub; - mixin DocReformRgxInit; - mixin contentJSONtoDocReformStruct; - mixin DocReformBiblio; - mixin DocReformRgxInitFlags; - mixin outputHub; - enum headBody { header, body_content, insert_file_list, image_list } - enum makeMeta { make, meta } - static auto rgx = Rgx(); - auto DocReformHarvestGetFromHead(E,O,M)( // TODO - E _env, - O 
_opt_action, - M _manifest - ){ - auto _config_document_struct = readConfigDoc!()(_manifest, _env); // document config file - auto _config_local_site_struct = readConfigSite!()(_manifest, _env); // local site config - ConfCompositePlus _make_and_meta_struct; - _make_and_meta_struct = configParseTOMLreturnDocReformStruct!()(_make_and_meta_struct, _config_document_struct); - _make_and_meta_struct = configParseTOMLreturnDocReformStruct!()(_make_and_meta_struct, _config_local_site_struct); - /+ ↓ read file (filename with path) +/ - /+ ↓ file tuple of header and content +/ - if ((_opt_action.debug_do) - || (_opt_action.very_verbose) - ) { - writeln("step1 commence → (get document header & body & insert file list & if needed image list)" - ); - } - auto _header_body_insertfilelist_imagelist - = DocReformRawMarkupContent!()(_opt_action, _manifest.src.path_and_fn); - static assert(!isTypeTuple!(_header_body_insertfilelist_imagelist)); - static assert(_header_body_insertfilelist_imagelist.length==4); - if ((_opt_action.debug_do) - || (_opt_action.very_verbose) - ) { - writeln("- step1 complete"); - } - debug(header_and_body) { - writeln(header); - writeln(_header_body_insertfilelist_imagelist.length); - writeln(_header_body_insertfilelist_imagelist.length[headBody.body_content][0]); - } - /+ ↓ split header into make and meta +/ - if ((_opt_action.debug_do) - || (_opt_action.very_verbose) - ) { - writeln("step2 commence → (read document header - toml, return struct)"); - } - _make_and_meta_struct = - docHeaderMakeAndMetaTupTomlExtractAndConvertToStruct!()( - _make_and_meta_struct, - _header_body_insertfilelist_imagelist[headBody.header] - ); - if ((_opt_action.debug_do) - || (_opt_action.very_verbose) - ) { - writeln("- step2 complete"); - } - - auto t = tuple(doc_matters_shared, doc_matters_abridged_collected); - static assert(t.length==2); - return t; - } -} diff --git a/src/doc_reform/meta/rgx.d b/src/doc_reform/meta/rgx.d index 373400f..544b432 100644 --- 
a/src/doc_reform/meta/rgx.d +++ b/src/doc_reform/meta/rgx.d @@ -7,6 +7,7 @@ static template DocReformRgxInit() { static struct Rgx { /+ misc +/ static true_dollar = ctRegex!(`\$`, "gm"); + static sep = ctRegex!(`␣`, "gm"); static flag_action = ctRegex!(`^(--[a-z][a-z0-9-]+)$`); static flag_action_str = ctRegex!(` (--[a-z][a-z0-9-]+)`); static within_quotes = ctRegex!(`"(.+?)"`, "m"); @@ -43,7 +44,7 @@ static template DocReformRgxInit() { /+ header +/ static variable_doc_title = ctRegex!(`@title`); static variable_doc_author = ctRegex!(`@author|@creator`); - static raw_author_munge = ctRegex!(`(\S.+?),\s+(.+)`,"i"); + static raw_author_munge = ctRegex!(`(?P<last>\S.+?),\s+(?P<first>.+)`,"i"); static toml_header_meta_title = ctRegex!(`^\s*(title\s*=\s*"|\[title\])`, "m"); /+ heading & paragraph operators +/ static heading_a = ctRegex!(`^:?[A][~] `, "m"); @@ -191,6 +192,10 @@ static template DocReformRgxInit() { static bi_main_term_plus_rest_split = ctRegex!(`\s*:\s*`); static bi_sub_terms_plus_object_number_offset_split = ctRegex!(`\s*\|\s*`); static bi_term_and_object_numbers_match = ctRegex!(`^(.+?)\+(\d+)`); + static topic_register_main_terms_split = ctRegex!(`\s*;\s*`); + static topic_register_main_term_plus_rest_split = ctRegex!(`\s*:\s*`); + static topic_register_sub_terms_split = ctRegex!(`\s*\|\s*`); + static topic_register_multiple_sub_terms_split = ctRegex!(`␣([^|␣]+(?:\|[^|␣]+)+)`); /+ language codes +/ auto language_codes = ctRegex!("(am|bg|bn|br|ca|cs|cy|da|de|el|en|eo|es|et|eu|fi|fr|ga|gl|he|hi|hr|hy|ia|is|it|ja|ko|la|lo|lt|lv|ml|mr|nl|no|nn|oc|pl|pt|pt_BR|ro|ru|sa|se|sk|sl|sq|sr|sv|ta|te|th|tk|tr|uk|ur|vi|zh)"); -- cgit v1.2.3