aboutsummaryrefslogtreecommitdiffhomepage
path: root/org/default_regex.org
diff options
context:
space:
mode:
authorRalph Amissah <ralph.amissah@gmail.com>2022-11-25 22:06:40 -0500
committerRalph Amissah <ralph.amissah@gmail.com>2022-12-23 18:17:41 -0500
commitf6d28b62f0e02b8a88a1832589e203c7a613f45b (patch)
treeb5d6462e45bae998190194784e02b143a83f79a3 /org/default_regex.org
parentgitignore & things nix (diff)
regex review, match speed & compile time, ctregex
- improve match time - add interim fontface identifier marker - improve compile time - remove unused regexs - separate out some specialized output matches
Diffstat (limited to 'org/default_regex.org')
-rw-r--r--org/default_regex.org198
1 files changed, 140 insertions, 58 deletions
diff --git a/org/default_regex.org b/org/default_regex.org
index 89d6ea3..976baa0 100644
--- a/org/default_regex.org
+++ b/org/default_regex.org
@@ -67,7 +67,6 @@ static template spineRgxIn() {
<<meta_rgx_bibliography>>
<<meta_rgx_book_index_split>>
<<meta_rgx_topic_register_split>>
- <<meta_rgx_language_codes>>
<<prgmkup_rgx_spaces>>
<<prgmkup_rgx_filename_and_path>>
<<prgmkup_rgx_inline_breaks>>
@@ -86,10 +85,6 @@ static template spineRgxIn() {
/+ misc +/
static flag_action = ctRegex!(`^(--[a-z][a-z0-9-]+)$`);
static within_quotes = ctRegex!(`"(.+?)"`, "m");
-static yaml_tag_is_str = ctRegex!(`:str$`);
-static yaml_tag_is_int = ctRegex!(`:int$`);
-static yaml_tag_is_map = ctRegex!(`:map$`);
-static yaml_tag_is_seq = ctRegex!(`:seq$`);
static make_heading_delimiter = ctRegex!(`[;][ ]*`);
static arr_delimiter = ctRegex!(`[ ]*[;][ ]*`);
static name_delimiter = ctRegex!(`^([^,]+)[ ]*,[ ]+(.+?)$`);
@@ -476,8 +471,6 @@ static template spineRgxOut() {
<<prgmkup_rgx_inline_links>>
<<prgmkup_rgx_inline_font_face>>
<<prgmkup_rgx_table>>
- <<sp_ch_xhtml_rgx>>
- <<sp_ch_latex_rgx>>
<<grouped_text_rgx_paragraph_marks>>
}
}
@@ -492,35 +485,22 @@ static make_breakpage = ctRegex!(`new=(?P<breakpage>.+
static make_breakcolumn = ctRegex!(`break=(?P<breakcolumn>.+?)(?:;|$)`,);
#+END_SRC
-** special characters
-*** xhtml special characters
+* 2. ctRegex defaults shared by meta & output (generic)
-#+NAME: sp_ch_xhtml_rgx
+** meta
+
+#+NAME: prgmkup_rgx_meta
#+BEGIN_SRC d
-static xhtml_ampersand = ctRegex!(`[&]`, "m"); // &amp;
-static xhtml_quotation = ctRegex!(`["]`, "m"); // &quot;
-static xhtml_less_than = ctRegex!(`[<]`, "m"); // &lt;
-static xhtml_greater_than = ctRegex!(`[>]`, "m"); // &gt;
-static xhtml_line_break = ctRegex!(` [\\]{2}`, "m"); // <br />
+static space = ctRegex!(`[ ]`, "mg"); // a single space character
+static spaces_keep = ctRegex!(`(?P<keep_spaces>^[ ]+|[ ]{2,})`, "mg"); // code, verse, block
#+END_SRC
-*** latex special characters
+** spine & source_in
-#+NAME: sp_ch_latex_rgx
+#+NAME: prgmkup_rgx_in
#+BEGIN_SRC d
-static latex_special_char = ctRegex!(`([%${}_#&\\])`);
-static latex_special_char_for_escape = ctRegex!(`([%${}_#\\])`);
-static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`);
-static latex_special_char_for_escape_url = ctRegex!(`([%])`);
-static latex_special_char_escaped = ctRegex!(`\\([%${}_#\\])`);
-static latex_special_char_escaped_braced = ctRegex!(`[{]\\([&])[}]`);
-static latex_identify_inline_link = ctRegex!(`┥.+?┝┤\S+?├`, "mg");
-static latex_identify_inline_fontface = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg");
-static latex_clean_internal_link = ctRegex!(`^(?:#|¤\S+?#)`, "m");
-static latex_clean_bookindex_linebreak = ctRegex!(`\s*\\\\\\\\\s*`, "m");
#+END_SRC
-* 2. ctRegex defaults shared by meta & output (generic)
** misc generic
#+NAME: prgmkup_rgx_spaces
@@ -534,24 +514,6 @@ static nbsp_chars = ctRegex!(`[░]+`, "mg");
static middle_dot = ctRegex!(`·`, "mg");
#+END_SRC
-** filename (& path) (including insert file) :insert:file:path:filename:
-
-#+NAME: prgmkup_rgx_filename_and_path
-#+BEGIN_SRC d
-static src_pth_sst_or_ssm = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.](?P<extension>ss[tm]))$`);
-static src_pth_pod_sst_or_ssm = ctRegex!(`^(?P<podpath>[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`);
-static src_pth_contents = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`);
-static src_pth_zip = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]zip)$`);
-static src_pth_types = ctRegex!(`^(?P<path>[/]?[a-zA-Z0-9._-]+/)*(?P<gotfile>(?P<filename>[a-zA-Z0-9._-]+[.]ss[tm])|(?P<filelist>[a-zA-Z0-9._-]+/pod[.]manifest)|(?P<filezip>[a-zA-Z0-9._-]+[.]zip))$`);
-static src_fn =
- ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P<fn_src>(?P<fn_base>[a-zA-Z0-9._-]+)[.](?P<fn_src_suffix>ss[tm]))$`);
-static src_fn_master = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ssm)$`);
-static src_fn_find_inserts = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ss[im])$`);
-static insert_src_fn_ssi_or_sst = ctRegex!(`^<<\s*(?P<path>[a-zA-Z0-9._-]+/)*(?P<filename>[a-zA-Z0-9._-]+[.]ss[ti])$`);
-static src_base_parent_dir_name = ctRegex!(`[/](?P<dir>(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure
-static src_formalised_file_path_parts = ctRegex!(`(?P<pth>(?:[/a-zA-Z0-9._-]+?)(?P<dir>[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure
-#+END_SRC
-
** inline markup
*** inline breaks
@@ -666,21 +628,21 @@ static quotation_mark_sql_insert_delimiter = ctRegex!("[']", "mg");
#+NAME: prgmkup_rgx_inline_font_face
#+BEGIN_SRC d
/+ inline markup font face mod +/
-static inline_emphasis = ctRegex!(`[*]┨(?P<text>.+?)┣[*]`, "mg");
-static inline_bold = ctRegex!(`[!]┨(?P<text>.+?)┣[!]`, "mg");
-static inline_underscore = ctRegex!(`[_]┨(?P<text>.+?)┣[_]`, "mg");
-static inline_italics = ctRegex!(`[/]┨(?P<text>.+?)┣[/]`, "mg");
-static inline_superscript = ctRegex!(`\^┨(?P<text>.+?)┣\^`, "mg");
-static inline_subscript = ctRegex!(`[,]┨(?P<text>.+?)┣[,]`, "mg");
-static inline_strike = ctRegex!(`[-]┨(?P<text>.+?)┣[-]`, "mg");
-static inline_insert = ctRegex!(`[+]┨(?P<text>.+?)┣[+]`, "mg");
-static inline_mono = ctRegex!(`[■]┨(?P<text>.+?)┣[■]`, "mg");
-static inline_cite = ctRegex!(`[‖]┨(?P<text>.+?)┣[‖]`, "mg");
+static inline_emphasis = ctRegex!(`⑆[*]┨(?P<text>.+?)┣[*]`, "mg"); // every pattern in this group requires the ⑆ marker before its opening delimiter
+static inline_bold = ctRegex!(`⑆[!]┨(?P<text>.+?)┣[!]`, "mg");
+static inline_underscore = ctRegex!(`⑆[_]┨(?P<text>.+?)┣[_]`, "mg");
+static inline_italics = ctRegex!(`⑆[/]┨(?P<text>.+?)┣[/]`, "mg");
+static inline_superscript = ctRegex!(`⑆\^┨(?P<text>.+?)┣\^`, "mg");
+static inline_subscript = ctRegex!(`⑆[,]┨(?P<text>.+?)┣[,]`, "mg");
+static inline_strike = ctRegex!(`⑆[-]┨(?P<text>.+?)┣[-]`, "mg");
+static inline_insert = ctRegex!(`⑆[+]┨(?P<text>.+?)┣[+]`, "mg");
+static inline_mono = ctRegex!(`⑆[■]┨(?P<text>.+?)┣[■]`, "mg");
+static inline_cite = ctRegex!(`⑆[‖]┨(?P<text>.+?)┣[‖]`, "mg");
#+END_SRC
#+BEGIN_SRC d
-// static inline_superscript = ctRegex!(`[\^]┨(?P<text>.+?)┣[\^]`, "mg");
-// static inline_fontface_clean = ctRegex!(`[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg");
+// static inline_superscript = ctRegex!(`⑆[\^]┨(?P<text>.+?)┣[\^]`, "mg");
+// static inline_fontface_clean = ctRegex!(`⑆[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg");
#+END_SRC
*** table related
@@ -692,6 +654,126 @@ static table_delimiter_col = ctRegex!("[ ]*[┊][ ]*", "mg"
static table_delimiter_row = ctRegex!("[ ]*\n", "mg");
#+END_SRC
+** files filename (& path) (including insert file) :insert:file:path:filename:
+
+#+HEADER: :tangle "../src/doc_reform/meta/rgx_files.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+ regex: regular expressions for source file names & paths, plus the language codes block
++/
+module doc_reform.meta.rgx_files;
+static template spineRgxFiles() {
+ static struct RgxFiles {
+ <<prgmkup_rgx_filename_and_path>>
+ <<meta_rgx_language_codes>>
+ }
+}
+#+END_SRC
+
+#+NAME: prgmkup_rgx_filename_and_path
+#+BEGIN_SRC d
+static src_pth_sst_or_ssm = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.](?P<extension>ss[tm]))$`); // path + filename ending .sst or .ssm
+static src_pth_pod_sst_or_ssm = ctRegex!(`^(?P<podpath>[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`); // .sst or .ssm under <podpath>media/text/<two lowercase letters>/
+static src_pth_contents = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`); // path ending in <filename>/pod.manifest
+static src_pth_zip = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]zip)$`); // path + filename ending .zip
+static src_pth_types = ctRegex!(`^(?P<path>[/]?[a-zA-Z0-9._-]+/)*(?P<gotfile>(?P<filename>[a-zA-Z0-9._-]+[.]ss[tm])|(?P<filelist>[a-zA-Z0-9._-]+/pod[.]manifest)|(?P<filezip>[a-zA-Z0-9._-]+[.]zip))$`); // any one of: .sst/.ssm file, <dir>/pod.manifest, .zip
+static src_fn = ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P<fn_src>(?P<fn_base>[a-zA-Z0-9._-]+)[.](?P<fn_src_suffix>ss[tm]))$`); // filename split into base and suffix (sst or ssm)
+static src_fn_master = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ssm)$`); // .ssm only
+static src_fn_find_inserts = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ss[im])$`); // .ssi or .ssm
+static insert_src_fn_ssi_or_sst = ctRegex!(`^<<\s*(?P<path>[a-zA-Z0-9._-]+/)*(?P<filename>[a-zA-Z0-9._-]+[.]ss[ti])$`); // line starting "<<" that names a .sst or .ssi file
+static src_base_parent_dir_name = ctRegex!(`[/](?P<dir>(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure
+static src_formalised_file_path_parts = ctRegex!(`(?P<pth>(?:[/a-zA-Z0-9._-]+?)(?P<dir>[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure
+#+END_SRC
+
+** _module template_ yaml tags
+
+#+HEADER: :tangle "../src/doc_reform/meta/rgx_yaml.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+ regex: regular expressions matching yaml type tags (:str, :int, :map, :seq); tangled file name matches the module name below
++/
+module doc_reform.meta.rgx_yaml;
+static template spineRgxYamlTags() {
+ static struct RgxYaml {
+ <<meta_rgx_yaml>>
+ }
+}
+#+END_SRC
+
+#+NAME: meta_rgx_yaml
+#+BEGIN_SRC d
+static yaml_tag_is_str = ctRegex!(`:str$`); // ":str" anchored at the end
+static yaml_tag_is_int = ctRegex!(`:int$`); // ":int" anchored at the end
+static yaml_tag_is_map = ctRegex!(`:map$`); // ":map" anchored at the end
+static yaml_tag_is_seq = ctRegex!(`:seq$`); // ":seq" anchored at the end
+#+END_SRC
+
+** special characters
+*** xhtml special characters template
+
+#+HEADER: :tangle "../src/doc_reform/io_out/rgx_xhtml.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+ regex: regular expressions matching xhtml special characters (io_out)
++/
+module doc_reform.io_out.rgx_xhtml;
+static template spineRgxXHTML() {
+ static struct RgxXHTML {
+ <<sp_ch_xhtml_rgx>>
+ }
+}
+#+END_SRC
+
+*** xhtml special characters
+
+#+NAME: sp_ch_xhtml_rgx
+#+BEGIN_SRC d
+static ampersand = ctRegex!(`[&]`, "m"); // &amp;
+static quotation = ctRegex!(`["]`, "m"); // &quot;
+static less_than = ctRegex!(`[<]`, "m"); // &lt;
+static greater_than = ctRegex!(`[>]`, "m"); // &gt;
+static line_break = ctRegex!(` [\\]{2}`, "m"); // <br />
+#+END_SRC
+
+*** LaTeX special characters template
+
+#+HEADER: :tangle "../src/doc_reform/io_out/rgx_latex.d"
+#+HEADER: :noweb yes
+#+BEGIN_SRC d
+<<doc_header_including_copyright_and_license>>
+/++
+ regex: regular expressions matching latex special characters (io_out)
++/
+module doc_reform.io_out.rgx_latex;
+static template spineRgxLSC() {
+ static struct RgxLSC {
+ <<sp_ch_latex_rgx>>
+ }
+}
+#+END_SRC
+
+*** latex special characters
+
+#+NAME: sp_ch_latex_rgx
+#+BEGIN_SRC d
+static latex_special_char = ctRegex!(`([%${}_#&\\])`); // captures one of: % $ { } _ # & \
+static latex_special_char_for_escape = ctRegex!(`([%${}_#\\])`); // same set without &
+static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`); // captures &
+static latex_special_char_for_escape_url = ctRegex!(`([%])`); // captures %
+static latex_special_char_escaped = ctRegex!(`\\([%${}_#\\])`); // backslash followed by a captured special char
+static latex_special_char_escaped_braced = ctRegex!(`[{]\\([&])[}]`); // {\&}, capturing the &
+static latex_identify_inline_link = ctRegex!(`┥.+?┝┤\S+?├`, "mg"); // ┥…┝ immediately followed by ┤…├ (no whitespace inside the second part)
+static latex_identify_inline_fontface = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg"); // backslash-escaped _ # or $ on both sides of a ┨…┣ span
+static latex_clean_internal_link = ctRegex!(`^(?:#|¤\S+?#)`, "m"); // leading "#" or "¤…#" at line start
+static latex_clean_bookindex_linebreak = ctRegex!(`\s*\\\\\\\\\s*`, "m"); // four literal backslashes with optional surrounding whitespace
+#+END_SRC
+
* document header including copyright & license
#+NAME: doc_header_including_copyright_and_license