1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
# coding: utf-8
=begin
* Name: SiSU
* Description: a framework for document structuring, publishing and search
#___#
* Author: Ralph Amissah
* Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
2007, 2008, 2009, 2010 Ralph Amissah All Rights Reserved.
* License: GPL 3 or later:
SiSU, a framework for document structuring, publishing and search
Copyright (C) Ralph Amissah
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
If you have an Internet connection, the latest version of the GPL should be
available at these locations:
<http://www.fsf.org/licensing/licenses/gpl.html>
<http://www.gnu.org/copyleft/gpl.html>
<http://www.jus.uio.no/sisu/gpl.fsf/toc.html>
<http://www.jus.uio.no/sisu/gpl.fsf/doc.html>
<http://www.jus.uio.no/sisu/gpl.fsf/plain.txt>
* SiSU uses:
* Standard SiSU markup syntax,
* Standard SiSU meta-markup syntax, and the
* Standard SiSU object citation numbering and system
* Homepages:
<http://www.jus.uio.no/sisu>
<http://www.sisudoc.org>
* Download:
<http://www.jus.uio.no/sisu/SiSU/download.html>
* Ralph Amissah
<ralph@amissah.com>
<ralph.amissah@gmail.com>
** Description: preparation of document text for the database: character escaping, markup stripping, clean searchable plain text
=end
module SiSU_DB_text
  # Text clean-up helpers: escaping of special characters, reduction of SiSU
  # source markup to searchable plain text, and stripping of internal markers.
  #
  # All patterns interpolate marker strings from the Mx table, which is
  # defined outside this file. The regexes are therefore built inside the
  # methods (at call time) rather than hoisted to class-level constants.
  class Prepare
    # Escapes/normalises a string in place and returns it.
    #
    # str:: String to process; it is MODIFIED IN PLACE (gsub!).
    # Returns the same String object.
    def special_character_escape(str)
      # Double every single quote.
      # NOTE(review): presumably for use inside SQL string literals - confirm against the caller.
      str.gsub!(/'/,"''") #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'")
      # Internal line-break markers become an HTML break followed by a newline.
      str.gsub!(/#{Mx[:br_line]}|#{Mx[:br_nl]}/,"<br />\n")
      # Drop anything enclosed in the tag open/close markers.
      str.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check
      # Image links (png/jpg, optional WxH size): keep file name and caption text, drop the target.
      str.gsub!(/#{Mx[:lnk_o]}\s*(\S+?\.(?:png|jpg))(?:\s+\d+x\d+)?(.+?)#{Mx[:lnk_c]}\S+/,'[image: \1] \2')
      # file:// and ftp:// links: keep the link text and any trailing punctuation/whitespace.
      str.gsub!(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/,'\1\2')
      # Links whose target is wrapped in the url markers: keep only the link text.
      str.gsub!(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,'\1')
      str
    end

    # Produces clean, searchable plain text from document source.
    #
    # arr:: Array of Strings (document source objects). Each string is
    #       MODIFIED IN PLACE while markup is removed.
    # Returns a new String: the cleaned strings joined with "\n", followed by
    # the text of every endnote (~{ ... }~) that was extracted from them, the
    # whole passed through special_character_escape.
    #
    # NOTE(review): the ":A~" branch reads @md (creator.author, title.full);
    # @md is not assigned anywhere in the visible part of this class, so it
    # must be provided elsewhere - confirm.
    def clean_searchable_text(arr) #produce clean, searchable, plaintext from document source
      endnotes=[]
      arr.each do |s|
        # inline font-face markup, e.g. *{bold}* /{italic}/ _{underscore}_ -{strike}-
        s.gsub!(/([*\/_-])\{(.+?)\}\1/,'\2')
        # opening/closing lines of group, poem and code blocks
        s.gsub!(/^(?:group|poem|code)\{/,''); s.gsub!(/^\}(?:group|poem|code)/,'')
        # an object that is entirely a "@name: value" header is blanked
        s.gsub!(/\A(?:@\S+:\s+.+)\Z/m,'')
        if s =~/^:A~/
          # the top level heading may carry @author/@title placeholders
          s.gsub!(/@author/,@md.creator.author)
          s.gsub!(/@title/,@md.title.full)
        end
        s.gsub!(/^(?:_[1-9]\*?|_\*)\s+/,'')    # indent / bullet markers
        s.gsub!(/^(?:[1-9]\~(\S+)?)\s+/,'')    # numbered heading markers
        s.gsub!(/^(?::?[A-C]\~(\S+)?)\s+/,'')  # lettered heading markers
        s.gsub!(/^%{1,3} .+/,'') #removed even if contained in code block
        s.gsub!(/<br>/,' ')
        # collect endnote bodies before removing the endnotes from the text
        endnotes << s.scan(/~\{\s*(.+?)\s*\}~/)
        s.gsub!(/~\{.+?\}~/,'')
        s.gsub!(/ \s+/,' ')
      end
      # scan returns nested arrays of captures; flatten puts text first, endnotes after
      txt=[arr,endnotes].flatten.join("\n")
      special_character_escape(txt)
    end

    # Strips internal markers and HTML-like tags from a string, in place.
    #
    # str:: String to process; it is MODIFIED IN PLACE.
    # Returns the same String object, with runs of whitespace collapsed to a
    # single space and leading/trailing whitespace removed.
    def strip_markup(str) #define rules, make same as in dal clean
      # superscripted numbers become bracketed numbers, e.g. [12]
      str.gsub!(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'[\1]')
      # " \;" sequences and non-breaking-space markers become a plain space
      str.gsub!(/(?: \\;|#{Mx[:nbsp]})+/,' ')
      str.gsub!(/#{Mx[:tc_o]}#{Mx[:tc_p]}#{Mx[:tc_p]}\d+(.+)#{Mx[:tc_c]}/u,'\1') #tables
      str.gsub!(/#{Mx[:tc_p]}#{Mx[:tc_p]}\d+#{Mx[:tc_p]}/u,' ') #tables
      str.gsub!(/#{Mx[:tc_p]}/u,' ') #tables tidy later
      # anything that looks like an angle-bracket tag
      str.gsub!(/<.+?>/,'')
      # Image links pointing at file:// or ftp:// targets. The scheme separator
      # is "://"; the pattern previously read "//:" and so could never match,
      # leaving image file names in the searchable text.
      str.gsub!(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+ /,' [image] ') # else image names found in search
      str.gsub!(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,' [image]') # else image names found in search
      str.gsub!(/\s\s+/,' ')
      str.strip!
      str
    end
  end
end
__END__
|