diff options
author | Ralph Amissah <ralph@amissah.com> | 2014-10-19 21:13:52 -0400 |
---|---|---|
committer | Ralph Amissah <ralph@amissah.com> | 2014-10-19 21:13:52 -0400 |
commit | 2c73f3060f9678f751c236fe17863d443f6a650f (patch) | |
tree | 80592d406e45eb6626f6cfdc79dbe65716cb70fc | |
parent | v5 v6: html_format, "id"s for objects & footnotes (diff) |
v5 v6: db, text search & display field, footnotes moved to end of text object
* cleaner, more useful search results
* cleaner text search field
* separate footnote fields redundant for search purposes
-rw-r--r-- | data/doc/sisu/CHANGELOG_v5 | 5 | ||||
-rw-r--r-- | data/doc/sisu/CHANGELOG_v6 | 5 | ||||
-rw-r--r-- | lib/sisu/v5/db_import.rb | 34 | ||||
-rw-r--r-- | lib/sisu/v5/db_sqltxt.rb | 28 | ||||
-rw-r--r-- | lib/sisu/v6/db_import.rb | 34 | ||||
-rw-r--r-- | lib/sisu/v6/db_sqltxt.rb | 28 |
6 files changed, 88 insertions, 46 deletions
diff --git a/data/doc/sisu/CHANGELOG_v5 b/data/doc/sisu/CHANGELOG_v5 index 0271a637..39591639 100644 --- a/data/doc/sisu/CHANGELOG_v5 +++ b/data/doc/sisu/CHANGELOG_v5 @@ -42,6 +42,11 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_5.7.1.orig.tar.xz * html_format, "id"s for objects & footnotes * remove trailing backslash for empty linebreak & paragraph, <br> <p> +* db, text search & display field, footnotes moved to end of text object + * cleaner, more useful search results + * cleaner text search field + * separate footnote fields redundant for search purposes + %% 5.7.0.orig.tar.xz (2014-10-12:40/7) http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_5.7.0 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_5.7.0-1 diff --git a/data/doc/sisu/CHANGELOG_v6 b/data/doc/sisu/CHANGELOG_v6 index 005803bf..6a76425c 100644 --- a/data/doc/sisu/CHANGELOG_v6 +++ b/data/doc/sisu/CHANGELOG_v6 @@ -32,6 +32,11 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_6.3.1.orig.tar.xz * html_format, "id"s for objects & footnotes * remove trailing backslash for empty linebreak & paragraph, <br> <p> +* db, text search & display field, footnotes moved to end of text object + * cleaner, more useful search results + * cleaner text search field + * separate footnote fields redundant for search purposes + %% 6.3.0.orig.tar.xz (2014-10-12:40/7) http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_6.3.0 http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_6.3.0-1 diff --git a/lib/sisu/v5/db_import.rb b/lib/sisu/v5/db_import.rb index 59cff28a..72fb3753 100644 --- a/lib/sisu/v5/db_import.rb +++ b/lib/sisu/v5/db_import.rb @@ -334,17 +334,17 @@ module SiSU_DbImport @en,@en_ast,@en_pls,@tuple_array=[],[],[],[] @col[:en_a],@col[:en_z]=nil,nil ao_array.each do |data| - data.obj.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1') - data.obj.gsub!(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ') - data.obj.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check + data.obj=data.obj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1'). + gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1'). + gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1'). + gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1'). + gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1'). + gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1'). + gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1'). + gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1'). + gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1'). + gsub(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 '). + gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check @col[:seg]=@@seg if data.of ==:para \ || data.of ==:heading \ @@ -374,7 +374,7 @@ module SiSU_DbImport @col[:lid]+=1 txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_minus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -425,7 +425,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -470,7 +470,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -501,7 +501,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -532,7 +532,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -603,7 +603,7 @@ module SiSU_DbImport else SiSU_FormatShared::CSS_Format.new(@md,data).norm end - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb index 6585fd66..3f6cf951 100644 --- a/lib/sisu/v5/db_sqltxt.rb +++ b/lib/sisu/v5/db_sqltxt.rb @@ -60,7 +60,7 @@ module SiSU_DbText class Prepare def special_character_escape(str) - str=str.gsub(/'/,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") + str=str.gsub(/'/m,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") gsub(/(\\)/m,'\1\1'). #ok but with warnings, double backslash on sqlite #str.gsub!(/[\\]/m,'\\x5C') #ok but with warnings, but not for sqlite #str.gsub!(/(\\)/m,'\1') #ok for sqlite not for pgsql gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"<br>\n"). gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check @@ -80,13 +80,29 @@ module SiSU_DbText gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). gsub(/ \s+/m,' ') #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ - s + txt_arr << s end - txt_arr << arr << en - #txt_arr=txt_arr.flatten + txt_arr=txt_arr << en txt=txt_arr.flatten.join("\n") - txt=special_character_escape(txt) - txt + special_character_escape(txt) + end + def clean_document_objects_body(arr) + txt_arr,en,en_arr=[],[],[] + arr=(arr.is_a?(String)) ? [ arr ] : arr + arr.each do |s| + en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) + s=s.gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'<sup>\1</sup>'). + gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). + gsub(/ \s+/m,' ') + txt_arr << s + end + en.flatten.each do |e| + e=e.sub(/^(\d+)\s*/,'<sup>\1</sup> ') + en_arr << e + end + txt_arr=txt_arr << en_arr + txt=txt_arr.flatten.join("\n<br>") + special_character_escape(txt) end def clean_searchable_text_from_document_source(arr) txt_arr,en=[],[] diff --git a/lib/sisu/v6/db_import.rb b/lib/sisu/v6/db_import.rb index 9473863d..5e159451 100644 --- a/lib/sisu/v6/db_import.rb +++ b/lib/sisu/v6/db_import.rb @@ -334,17 +334,17 @@ module SiSU_DbImport @en,@en_ast,@en_pls,@tuple_array=[],[],[],[] @col[:en_a],@col[:en_z]=nil,nil ao_array.each do |data| - data.obj.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1') - data.obj.gsub!(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1') - data.obj.gsub!(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ') - data.obj.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check + data.obj=data.obj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1'). + gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1'). + gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1'). + gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1'). + gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1'). + gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1'). + gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1'). + gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1'). + gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1'). + gsub(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 '). + gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check @col[:seg]=@@seg if data.of ==:para \ || data.of ==:heading \ @@ -374,7 +374,7 @@ module SiSU_DbImport @col[:lid]+=1 txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_minus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -425,7 +425,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -470,7 +470,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -501,7 +501,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -532,7 +532,7 @@ module SiSU_DbImport @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html" txt=endnotes(txt).extract_any body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -603,7 +603,7 @@ module SiSU_DbImport else SiSU_FormatShared::CSS_Format.new(@md,data).norm end - @col[:body]=special_character_escape(body) + @col[:body]=clean_document_objects_body(body) plaintext=@col[:body].dup plaintext=strip_markup(plaintext) @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) diff --git a/lib/sisu/v6/db_sqltxt.rb b/lib/sisu/v6/db_sqltxt.rb index 2fd39fb7..2375d5ca 100644 --- a/lib/sisu/v6/db_sqltxt.rb +++ b/lib/sisu/v6/db_sqltxt.rb @@ -60,7 +60,7 @@ module SiSU_DbText class Prepare def special_character_escape(str) - str=str.gsub(/'/,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") + str=str.gsub(/'/m,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") gsub(/(\\)/m,'\1\1'). #ok but with warnings, double backslash on sqlite #str.gsub!(/[\\]/m,'\\x5C') #ok but with warnings, but not for sqlite #str.gsub!(/(\\)/m,'\1') #ok for sqlite not for pgsql gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"<br>\n"). gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check @@ -80,13 +80,29 @@ module SiSU_DbText gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). gsub(/ \s+/m,' ') #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ - s + txt_arr << s end - txt_arr << arr << en - #txt_arr=txt_arr.flatten + txt_arr=txt_arr << en txt=txt_arr.flatten.join("\n") - txt=special_character_escape(txt) - txt + special_character_escape(txt) + end + def clean_document_objects_body(arr) + txt_arr,en,en_arr=[],[],[] + arr=(arr.is_a?(String)) ? [ arr ] : arr + arr.each do |s| + en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) + s=s.gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'<sup>\1</sup>'). + gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). + gsub(/ \s+/m,' ') + txt_arr << s + end + en.flatten.each do |e| + e=e.sub(/^(\d+)\s*/,'<sup>\1</sup> ') + en_arr << e + end + txt_arr=txt_arr << en_arr + txt=txt_arr.flatten.join("\n<br>") + special_character_escape(txt) end def clean_searchable_text_from_document_source(arr) txt_arr,en=[],[] |