diff options
| -rw-r--r-- | data/doc/sisu/CHANGELOG_v5 | 3 | ||||
| -rw-r--r-- | data/doc/sisu/CHANGELOG_v6 | 3 | ||||
| -rw-r--r-- | lib/sisu/v5/db_columns.rb | 2 | ||||
| -rw-r--r-- | lib/sisu/v5/db_import.rb | 26 | ||||
| -rw-r--r-- | lib/sisu/v5/db_sqltxt.rb | 28 | ||||
| -rw-r--r-- | lib/sisu/v6/db_columns.rb | 2 | ||||
| -rw-r--r-- | lib/sisu/v6/db_import.rb | 26 | ||||
| -rw-r--r-- | lib/sisu/v6/db_sqltxt.rb | 28 | 
8 files changed, 82 insertions, 36 deletions
| diff --git a/data/doc/sisu/CHANGELOG_v5 b/data/doc/sisu/CHANGELOG_v5 index 2b2a7b89..eed72ecb 100644 --- a/data/doc/sisu/CHANGELOG_v5 +++ b/data/doc/sisu/CHANGELOG_v5 @@ -38,6 +38,9 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_5.6.8.orig.tar.xz    sisu_5.6.8.orig.tar.xz    sisu_5.6.8-1.dsc +* sql, clean searchable text, update for (ao/dal) text representation, +  fix legacy action +  %% 5.6.7.orig.tar.xz (2014-09-19:37/5)  http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_5.6.7  http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_5.6.7-1 diff --git a/data/doc/sisu/CHANGELOG_v6 b/data/doc/sisu/CHANGELOG_v6 index 9771fccf..7350e085 100644 --- a/data/doc/sisu/CHANGELOG_v6 +++ b/data/doc/sisu/CHANGELOG_v6 @@ -28,6 +28,9 @@ http://www.jus.uio.no/sisu/pkg/src/sisu_6.2.9.orig.tar.xz    sisu_6.2.9.orig.tar.xz    sisu_6.2.9-1.dsc +* sql, clean searchable text, update for (ao/dal) text representation, +  fix legacy action +  %% 6.2.8.orig.tar.xz (2014-09-19:37/5)  http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/sisu_6.2.8  http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=log;h=refs/tags/debian/sisu_6.2.8-1 diff --git a/lib/sisu/v5/db_columns.rb b/lib/sisu/v5/db_columns.rb index 44d45e95..15341042 100644 --- a/lib/sisu/v5/db_columns.rb +++ b/lib/sisu/v5/db_columns.rb @@ -81,7 +81,7 @@ module SiSU_DbColumns            @sisutxt=special_character_escape(src)          else @sisutxt=''          end -        @fulltext=clean_searchable_text(txt_arr) +        @fulltext=clean_searchable_text_from_document_objects(txt_arr)         else @sisutxt,@fulltext='',''        end      end diff --git a/lib/sisu/v5/db_import.rb b/lib/sisu/v5/db_import.rb index a7f33939..8a500f8a 100644 --- a/lib/sisu/v5/db_import.rb +++ b/lib/sisu/v5/db_import.rb @@ -292,7 +292,7 @@ module SiSU_DbImport          src=txt_arr.join("\n")          src=special_character_escape(src)          @tp[:sisutxt_f],@tp[:sisutxt_i]='sisutxt, ',"'#{src}', " -        txt=clean_searchable_text(txt_arr) +        txt=clean_searchable_text_from_document_source(txt_arr)          #txt=special_character_escape(txt)          @tp[:fulltxt_f],@tp[:fulltxt_i]='fulltxt, ',"'#{txt}', "        end @@ -374,9 +374,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                if @en[0] then @en_a,@en_z=@en[0].first,@en[0].last                end                if @en_ast[0] then @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last @@ -425,9 +425,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -470,9 +470,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -501,9 +501,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -532,9 +532,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -603,9 +603,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                t=SiSU_DbTuple::LoadDocuments.new(@conn,@col,@opt,@file_maint)                @tuple_array << t.tuple                @en,@en_ast,@en_pls=[],[],[] diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb index 99d417e1..adb2b0f0 100644 --- a/lib/sisu/v5/db_sqltxt.rb +++ b/lib/sisu/v5/db_sqltxt.rb @@ -71,7 +71,27 @@ module SiSU_DbText          gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2').          gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')      end -    def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source +    def clean_searchable_text_from_document_objects(arr) +      txt_arr,en=[],[] +      arr=(arr.is_a?(String)) ? [ arr ] : arr +      arr.each do |s| +        s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,''). +            gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,''). +            gsub(/<br>/m,' ') +        en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) +        s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,''). +          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). +          gsub(/ \s+/m,' ') +        #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ +        s +      end +      txt_arr << arr << en +      #txt_arr=txt_arr.flatten +      txt=txt_arr.flatten.join("\n") +      txt=special_character_escape(txt) +      txt +    end +    def clean_searchable_text_from_document_source(arr)        txt_arr,en=[],[]        arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr        arr.each do |s| @@ -107,14 +127,14 @@ module SiSU_DbText            gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').            gsub(/^%{1,3} .+/m,''). #removed even if contained in code block            gsub(/<br>/m,' ') -        en << s.scan(/~\{\s*(.+?)\s*\}~/m) +        #en << s.scan(/~\{\s*(.+?)\s*\}~/m)          s=s.gsub(/~\{.+?\}~/m,'').            gsub(/ \s+/m,' ') -        #special_character_escape(s) +        ##special_character_escape(s) +        #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/          s        end        txt_arr << arr << en -      #txt_arr=txt_arr.flatten        txt=txt_arr.flatten.join("\n")        txt=special_character_escape(txt)        txt diff --git a/lib/sisu/v6/db_columns.rb b/lib/sisu/v6/db_columns.rb index 343f66e6..005c45b5 100644 --- a/lib/sisu/v6/db_columns.rb +++ b/lib/sisu/v6/db_columns.rb @@ -81,7 +81,7 @@ module SiSU_DbColumns            @sisutxt=special_character_escape(src)          else @sisutxt=''          end -        @fulltext=clean_searchable_text(txt_arr) +        @fulltext=clean_searchable_text_from_document_objects(txt_arr)         else @sisutxt,@fulltext='',''        end      end diff --git a/lib/sisu/v6/db_import.rb b/lib/sisu/v6/db_import.rb index 3e2e7271..ccffb904 100644 --- a/lib/sisu/v6/db_import.rb +++ b/lib/sisu/v6/db_import.rb @@ -292,7 +292,7 @@ module SiSU_DbImport          src=txt_arr.join("\n")          src=special_character_escape(src)          @tp[:sisutxt_f],@tp[:sisutxt_i]='sisutxt, ',"'#{src}', " -        txt=clean_searchable_text(txt_arr) +        txt=clean_searchable_text_from_document_source(txt_arr)          #txt=special_character_escape(txt)          @tp[:fulltxt_f],@tp[:fulltxt_i]='fulltxt, ',"'#{txt}', "        end @@ -374,9 +374,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                if @en[0] then @en_a,@en_z=@en[0].first,@en[0].last                end                if @en_ast[0] then @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last @@ -425,9 +425,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -470,9 +470,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -501,9 +501,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -532,9 +532,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -603,9 +603,9 @@ module SiSU_DbImport                @col[:body]=special_character_escape(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext) -              @col[:plaintext]=clean_searchable_text(plaintext) +              @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext)                book_idx=book_idx_hash_to_str(data.idx) -              @col[:book_idx]=clean_searchable_text(book_idx) +              @col[:book_idx]=clean_searchable_text_from_document_objects(book_idx)                t=SiSU_DbTuple::LoadDocuments.new(@conn,@col,@opt,@file_maint)                @tuple_array << t.tuple                @en,@en_ast,@en_pls=[],[],[] diff --git a/lib/sisu/v6/db_sqltxt.rb b/lib/sisu/v6/db_sqltxt.rb index ac96df38..6d2cbb0a 100644 --- a/lib/sisu/v6/db_sqltxt.rb +++ b/lib/sisu/v6/db_sqltxt.rb @@ -71,7 +71,27 @@ module SiSU_DbText          gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2').          gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')      end -    def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source +    def clean_searchable_text_from_document_objects(arr) +      txt_arr,en=[],[] +      arr=(arr.is_a?(String)) ? [ arr ] : arr +      arr.each do |s| +        s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,''). +            gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,''). +            gsub(/<br>/m,' ') +        en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) +        s=s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,''). +          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). +          gsub(/ \s+/m,' ') +        #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ +        s +      end +      txt_arr << arr << en +      #txt_arr=txt_arr.flatten +      txt=txt_arr.flatten.join("\n") +      txt=special_character_escape(txt) +      txt +    end +    def clean_searchable_text_from_document_source(arr)        txt_arr,en=[],[]        arr=(arr.is_a?(String)) ? arr.split(/\n+/m) : arr        arr.each do |s| @@ -107,14 +127,14 @@ module SiSU_DbText            gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').            gsub(/^%{1,3} .+/m,''). #removed even if contained in code block            gsub(/<br>/m,' ') -        en << s.scan(/~\{\s*(.+?)\s*\}~/m) +        #en << s.scan(/~\{\s*(.+?)\s*\}~/m)          s=s.gsub(/~\{.+?\}~/m,'').            gsub(/ \s+/m,' ') -        #special_character_escape(s) +        ##special_character_escape(s) +        #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/          s        end        txt_arr << arr << en -      #txt_arr=txt_arr.flatten        txt=txt_arr.flatten.join("\n")        txt=special_character_escape(txt)        txt | 
