diff options
author | Ralph Amissah <ralph@amissah.com> | 2008-12-02 23:54:23 -0500 |
---|---|---|
committer | Ralph Amissah <ralph@amissah.com> | 2008-12-02 23:54:23 -0500 |
commit | 0e6fc15ada3c5d9a86b227163f35a54993b32529 (patch) | |
tree | 90ac98f2dadf8a2731fac4921fb5d9263eeedeb9 /lib/sisu/v0/harvest_authors.rb | |
parent | sha256 for 0.69.4 (diff) |
sisu harvest, introduce module along with header syntax addition & modification
* sisu markup, additional header and new format rule:
* @creator: / @author: header field, introduced author name format rules
for more usable metadata harvesting: surname comma other names, additional
authors separated by semi-colon
* param added meta-tag, @topic_register: formatting topic levels are
separated from sub-levels by a colon, a semi-colon separates main topics
if there are multiple topics at lowest sub-level, a pipe can be used to
create multiple headings
* harvest module, harvests metadata from document set currently extracts: (i)
authors and their writings from document set; (ii) topics and associated
writings from document set (topics use topic_register header). harvest
(when run against documents common to a directory of a site) extracts
metadata and organises the documents on a site by author and topic
information provided (there is a new "topic_register" header, with
formatting rules similar to those of the book index), results are placed in
[output_path]/sisu_site_metadata.
sisu --harvest *.sst
* by author (see change in param @creator: / @author: header field)
* by topic / subject index (see addition in param of @topic_register:
header field)
initially there should be example samples here:
http://www.jus.uio.no/sisu/sisu_site_metadata/harvest_authors.html
http://www.jus.uio.no/sisu/sisu_site_metadata/harvest_topics.html
together with updated markup source files
The authors and their writings list will be made to take on a more
bibliographical form, with the use of additional fields as required.
(concept example, suitable for medium sized sites [to remove size
constraint: implement SQL equivalent]) make feature more robust
* css, for harvest output added
* remote placement of sisu_site_metadata (output produced by metadata harvest)
* sisu markup, update document samples accordingly
* tidy copyright marks in program headers, remove repetition of dates
[version bump because formatting rule introduced to author / creator header -
where new site metadata harvest feature is used, (at present changes
should not be noticed except when using metadata harvest)]
Diffstat (limited to 'lib/sisu/v0/harvest_authors.rb')
-rw-r--r-- | lib/sisu/v0/harvest_authors.rb | 288 |
1 files changed, 288 insertions, 0 deletions
# coding: utf-8
# diff --git a/lib/sisu/v0/harvest_authors.rb b/lib/sisu/v0/harvest_authors.rb
# new file mode 100644, index 00000000..7a5e1cea (--- /dev/null, +++ b/lib/sisu/v0/harvest_authors.rb)
=begin

 * Name: SiSU

 * Description: a framework for document structuring, publishing and search
   metadata harvest, extract authors and their writings from document set

 * Author: Ralph Amissah

 * Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   2007, 2008 Ralph Amissah All Rights Reserved.

 * License: GPL 3 or later:

   SiSU, a framework for document structuring, publishing and search

   Copyright (C) Ralph Amissah

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along with
   this program. If not, see <http://www.gnu.org/licenses/>.

   If you have Internet connection, the latest version of the GPL should be
   available at these locations:
   <http://www.fsf.org/licensing/licenses/gpl.html>
   <http://www.gnu.org/licenses/gpl.html>

   <http://www.jus.uio.no/sisu/gpl.fsf/toc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/doc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/plain.txt>

 * SiSU uses:
   * Standard SiSU markup syntax,
   * Standard SiSU meta-markup syntax, and the
   * Standard SiSU object citation numbering and system

 * Homepages:
   <http://www.jus.uio.no/sisu>
   <http://www.sisudoc.org>

 * Download:
   <http://www.jus.uio.no/sisu/SiSU/download.html>

 * Ralph Amissah
   <ralph@amissah.com>
   <ralph.amissah@gmail.com>

 ** Description: simple xml representation (sax style)

=end
# Metadata harvest: collects the author/title headers of a set of SiSU markup
# files and writes an alphabetical "authors and their writings" HTML page.
module HARVEST_authors
  require "#{SiSU_lib}/author_format"

  # Author index shared by every harvest run in this process.
  # Maps an author name (String) to { :md => [ {document entry}, ... ] }.
  # It is a Hash (not an Array) because it is keyed by author name, and it is
  # held on the module itself so that the nested classes can reach it through
  # HARVEST_authors.the_idx_authors (a class variable set in the module body is
  # not visible from inside the nested classes).
  @the_idx_authors = {}
  class << self
    attr_reader :the_idx_authors
  end

  # Entry point: drives header extraction, index construction and HTML output
  # for the files named on the command line.
  class Songsheet
    # opt:: command line options object; this class reads +opt.files+ and
    #       +opt.cmd+ from it.
    def initialize(opt)
      @opt = opt
      @file_list = opt.files
      @env = SiSU_Env::Info_env.new
    end

    # Harvests every .sst/.ssm file in the file list, builds the author index,
    # writes harvest_authors.html and prints the location(s) of the result.
    def songsheet
      idx_array = []
      markup_files.each do |filename|
        idx_array = HARVEST_authors::Harvest.new(read_header(filename), filename, idx_array).extract_harvest
      end
      the_idx = HARVEST_authors::Index.new(idx_array, HARVEST_authors.the_idx_authors).construct_book_author_index
      #HARVEST_authors::Output_index.new(the_idx).screen_print.cycle
      HARVEST_authors::Output_index.new(@opt, the_idx).html_print.html_songsheet
      puts "file://#{@env.path.output_md_harvest}/harvest_authors.html"
      puts "file://#{@env.path.pwd}/harvest_authors.html" if @opt.cmd.inspect =~ /-M/
    end

    private

    # Returns the entries of the file list that look like SiSU markup files,
    # reporting (but otherwise ignoring) anything else.
    def markup_files
      files = []
      @file_list.each do |f|
        if f =~ /.+?\.ss[tm]$/
          files << f[/(.+?\.ss[tm])$/, 1]
        else
          print "not .sst or .ssm ? << #{f} >> "
        end
      end
      files
    end

    # Reads the leading header paragraphs ("@field: value") of +filename+.
    # Paragraphs are separated by blank lines; blank or "%" comment paragraphs
    # are skipped and reading stops at the first paragraph that is neither.
    def read_header(filename)
      header = []
      File.open(filename, 'r') do |file|
        file.each_line("\n\n") do |para|
          if para =~ /^@\S+?: /
            header << para
          elsif para !~ /^(?:\s*\n|%+ )/
            break
          end
        end
      end
      header
    end
  end

  # Extracts title / author information from the header paragraphs of a single
  # document and appends one entry for it to the running list.
  class Harvest
    # Header fields that are harvested, keyed by the name used internally.
    HEADER_RGX = {
      :author              => /^@(?:author|creator):\s+(.+)/,
      :title               => /^@title:\s+(.+)/,
      :subtitle            => /^@subtitle:\s+(.+)/,
      :publication_details => /^@original_publication_details:\s+(.+)/
    }.freeze

    # data::      array of header paragraphs (strings) of one document
    # filename::  name of the markup file the paragraphs came from
    # idx_array:: list of entries harvested so far; it is appended to in place
    def initialize(data, filename, idx_array)
      @data, @filename, @idx_array = data, filename, idx_array
    end

    # Returns +idx_array+, extended by one hash
    # (:filename, :file, :publication_details, :title, :author) when the
    # document has both a title and an author/creator header.
    def extract_harvest
      @publication_details = @title = @subtitle = @fulltitle = @author_format = nil
      @authors = []
      @data.each do |para|
        if (m = HEADER_RGX[:publication_details].match(para))
          @publication_details = m[1]
        end
        if (m = HEADER_RGX[:title].match(para))
          @title = m[1]
        end
        if (m = HEADER_RGX[:subtitle].match(para))
          @subtitle = m[1]
        end
        if (m = HEADER_RGX[:author].match(para))
          @author_format = m[1]
        end
        # stop early once every harvested field has been seen
        break if @title && @subtitle && @author_format && @publication_details
      end
      @fulltitle = @subtitle ? "#{@title} - #{@subtitle}" : @title
      if @title && @author_format #and @publication_details
        creator = FORMAT::Author.new(@author_format.strip).author_details
        @authors, @authorship = creator[:authors], creator[:authorship]
        @idx_array << {
          :filename            => @filename,
          :file                => @filename.sub(/\.ss[mt]$/, ''),
          :publication_details => @publication_details,
          :title               => @fulltitle,
          :author              => creator
        }
      else
        #p "missing author field: #@filename title: #@title; author: #@author_format; idx: #@publication_details"
      end
      @idx_array.flatten!
      @idx_array
    end
  end

  # Groups the harvested document entries by author name.
  class Index
    # idx_array:: entries produced by Harvest#extract_harvest
    # the_idx::   Hash to accumulate into (author name => { :md => [...] }).
    #             Anything that is not a Hash (older callers passed []) is
    #             replaced by a fresh Hash, since the index is keyed by name.
    def initialize(idx_array, the_idx)
      @idx_array = idx_array
      @the_idx = the_idx.is_a?(Hash) ? the_idx : {}
    end

    # Returns +txt+ with its first character capitalised.
    def capital(txt)
      return txt if txt.nil? || txt.empty?
      txt[0].chr.capitalize + txt[1, txt.length]
    end

    # Files every entry under each of its authors (read from
    # entry[:author][:last_first_format_a]) and returns the resulting index.
    def construct_book_author_index
      @idx_array.each do |idx|
        idx[:author][:last_first_format_a].each do |author|
          author.strip!
          @the_idx[author] ||= { :md => [] }
          @the_idx[author][:md] << {
            :filename => idx[:filename],
            :file     => idx[:file],
            :author   => idx[:author],
            :title    => idx[:title]
          }
        end
      end
      @the_idx
    end
  end

  # Writes the author index, either as HTML (html_print.html_songsheet) or as
  # plain text on the terminal (screen_print.cycle).
  class Output_index
    # opt::     command line options object (+opt.cmd+ is inspected for "-M")
    # the_idx:: index built by Index#construct_book_author_index
    #
    # Opens harvest_authors.html under the harvest output path and, when the
    # command contains -M, a second copy in the working directory.
    def initialize(opt, the_idx)
      @opt, @the_idx = opt, the_idx
      @env = SiSU_Env::Info_env.new
      @rc = Get_init.instance.yamlrc
      @page = 'sisu_manifest.html'
      @output = {}
      @output[:html] = File.new("#{@env.path.output_md_harvest}/harvest_authors.html", 'w')
      @output[:html_mnt] = if @opt.cmd.inspect =~ /-M/
        File.new("#{@env.path.pwd}/harvest_authors.html", 'w')
      end
      @alph = %W[9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]
      @letter = @alph.shift
    end

    # Chaining entry point for the HTML writer: html_print.html_songsheet
    def html_print
      self
    end

    # Writes the complete HTML page and closes the output file(s), so the
    # page can be written only once per Output_index instance.
    def html_songsheet
      html_head
      html_alph
      html_body
      html_tail
    ensure
      close_outputs
    end

    # Returns the HTML page head; +type+ 'maintenance' selects the stylesheet
    # path used for the working-directory copy.
    def html_head_adjust(type='')
      css_path = if type !~ /maintenance/
        '../_sisu/css/harvest.css'
      else 'harvest.css'
      end
      <<WOK
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>SiSU Metadata Harvest - Authors</title>
<link rel="stylesheet" href="#{css_path}" type="text/css" />
</head>
<body bgcolor="#ffffff" text="#000000" link="#003090" lang="en" xml:lang="en">
<h1>SiSU Metadata Harvest - Authors</h1>
<p>also see <a href="harvest_topics.html">SiSU Metadata Harvest - Topics</a></p>
<hr />
WOK
    end

    # Writes the page head to each open output.
    def html_head
      @output[:html_mnt] << html_head_adjust('maintenance') if @output[:html_mnt]
      @output[:html] << html_head_adjust
    end

    # Writes the row of links to the letter anchors.
    def html_alph
      links = @alph.map do |x|
        x =~ /[0-9]/ ? '' : %{<a href="##{x}">#{x}</a>, }
      end
      do_html("<p>#{links.join}</p>")
    end

    # Writes the closing tags of the page.
    def html_tail
      # written as a String: appending an Array to a File writes its inspected form
      do_html("</body>\n</html>\n")
    end

    # Appends +html+ to every open output.
    def do_html(html)
      @output[:html_mnt] << html if @output[:html_mnt]
      @output[:html] << html
    end

    # Writes +string+ as a paragraph of CSS class +attrib+.
    def do_string(attrib, string)
      do_html(%{<p class="#{attrib}">#{string}</p>})
    end

    # Emits the letter headings that precede the author about to be written.
    # string:: [author_name, entry] pair; only the name's first character is used
    # attrib:: unused, kept for interface compatibility
    def do_string_name(attrib, string)
      first = /^(\S)/.match(string[0].to_s)
      return unless first # empty name, or one that starts with whitespace: no heading
      f = first[1]
      while @letter < f && !@alph.empty?
        @letter = @alph.shift
        do_html(%{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>})
      end
    end

    # Writes every author (sorted by name) followed by their publications.
    def html_body
      @the_idx.sort.each do |a|
        do_string_name('', a)
        name = a[0].sub(/(.+?)(?:,.+|$)/, '\1').gsub(/\s+/, '_')
        do_html(%{<p class="author"><a name="#{name}">#{a[0]}</a></p>})
        a[1][:md].each do |doc|
          if @output[:html_mnt]
            @output[:html_mnt] << %{<p class="publication">[<a href="#{doc[:file]}.sst">src</a>] <a href="file://#{@env.path.output}/#{doc[:file]}/#{@page}">#{doc[:title]}</a>, #{doc[:author][:authors_s]} -- [<a href="#{doc[:file]}.sst">#{doc[:file]}.sst</a>]</p>}
          end
          @output[:html] << %{<p class="publication"><a href="../#{doc[:file]}/#{@page}">#{doc[:title]}</a>, #{doc[:author][:authors_s]}</p>}
        end
      end
    end

    # Chaining entry point for the terminal writer: screen_print.cycle
    def screen_print
      self
    end

    # Prints each author followed by the (tab indented) files attributed to them.
    def cycle
      @the_idx.sort.each do |a|
        puts a[0]
        a[1][:md].each do |doc|
          puts "\t" + doc[:file]
        end
      end
    end

    private

    # Closes every output file that was opened and is still open.
    def close_outputs
      @output.each_value { |io| io.close if io && !io.closed? }
    end
  end
end
__END__