[Koha-cvs] koha/misc build_marc_Tword.pl build_marc_word.pl [rel_3_0]

paul poulain paul at koha-fr.org
Fri Nov 17 13:56:37 CET 2006


CVSROOT:	/sources/koha
Module name:	koha
Branch:		rel_3_0
Changes by:	paul poulain <tipaul>	06/11/17 12:56:37

Removed files:
	misc           : build_marc_Tword.pl build_marc_word.pl 

Log message:
	removing useless scripts

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/misc/build_marc_Tword.pl?cvsroot=koha&only_with_tag=rel_3_0&r1=1.3&r2=0
http://cvs.savannah.gnu.org/viewcvs/koha/misc/build_marc_word.pl?cvsroot=koha&only_with_tag=rel_3_0&r1=1.1&r2=0

Patches:
Index: build_marc_Tword.pl
===================================================================
RCS file: build_marc_Tword.pl
diff -N build_marc_Tword.pl
--- build_marc_Tword.pl	1 Jun 2005 18:55:08 -0000	1.3
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,129 +0,0 @@
-#!/usr/bin/perl -w
-#-----------------------------------
-# Script Name: build_marc_Tword.pl
-# Script Version: 0.1.0
-# Date:  2004/06/05
-
-# script to build a marc_Tword table.
-# create the table :
-# CREATE TABLE `marc_Tword` (
-#  `word` varchar(80) NOT NULL default '',
-#  `usedin` text NOT NULL,
-#  `tagsubfield` varchar(4) NOT NULL default '',
-#  PRIMARY KEY  (`word`,`tagsubfield`)
-#) TYPE=MyISAM;
-# just to test the idea of a reversed index searching.
-# reversed index for searchs on Title.
-# the marc_Tword table contains for each word & marc field/subfield, the list of biblios using it, with the title
-# reminder : the inverted index is only done to search on a "contain". For a "=" or "start by", the marc_subfield_table is perfect & correctly indexed.
-# if this POC becomes more than a POC, then I think we will have to build 1 table for each sorting (marc_Tword for title, Aword for author, Cword for callnumber...)
-
-# FIXME :
-# * indexes empty words too (it's just a proof of concept)
-# * maybe it would be OK to store only 20 char of the title.
-
-use strict;
-use locale;
-use C4::Context;
-use C4::Biblio;
-my $dbh=C4::Context->dbh;
-use Time::HiRes qw(gettimeofday);
-
-# fields & subfields to ignore
-# in real situation, we should add a marc constraint on this.
-# ideally, we should not inde isbn, as every would be different, so it makes the table very big.
-# but in this case we have to find a way to automatically search "isbn = XXX" in marc_subfield_table
-
-my %ignore_list = (
-	'001' =>1,
-	'010b'=>1,
-	'0909' => 1,
-	'090a' => 1,
-	'100' => 1,
-	'105' => 1,
-	'6069' => 1,
-	'7009' => 1,
-	'7019' => 1,
-	'7109' => 1,
-	'7129' => 1,
-	'9959' => 1,
-);
-
-my $starttime = gettimeofday;
-
-$dbh->do("delete from marc_Tword");
-
-# parse every line
-my $query="SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber where tag=?";
-my $sth=$dbh->prepare($query);
-
-for (my $looptag=0;$looptag<=999;$looptag++) {
-	print "******** SELECTING ".(sprintf "%03s",$looptag)."\n";
-	$sth->execute(sprintf "%03s",$looptag);
-	print "******** DONE \n";
-	$|=1; # flushes output
-	
-	my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
-	my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
-	my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
-	my $i=0;
-	my $timeneeded;
-	# 1st version, slower, but less RAM consumming
-	# while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
-	# 	next if $ignore_list{"$tag.$subfieldcode"};
-	#     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
-	# 	# remove useless chars in the title.
-	#     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
-	#     my @words = split / /, $subfieldvalue;
-	# 	# and retrieve the reversed entry
-	# 	foreach my $word (@words) {
-	# 		$sthT->execute($tag.$subfieldcode,$word);
-	# 		if (my ($usedin) = $sthT->fetchrow) {
-	# 			# add the field & save it once again.
-	# 			$usedin.=",$biblionumber-$title";
-	# 			$updateT->execute($usedin,$tag.$subfieldcode,$word);
-	# 		} else {
-	# 			$insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
-	# 		}
-	# 	}
-	# 	$timeneeded = gettimeofday - $starttime unless ($i % 100);
-	# 	print "$i in $timeneeded s\n" unless ($i % 100);
-	# 	print ".";
-	# 	$i++;
-	# }
-	
-	# 2nd version : faster (about 100 times !), bug maybe too much RAM consumming...
-	my %largehash;
-# 	print "READING\n";
-	$timeneeded = gettimeofday - $starttime unless ($i % 30000);
-	print "READING $timeneeded s\n";
-	while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
-		next unless $subfieldvalue;
-		next if $ignore_list{$tag.$subfieldcode};
-		$subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $subfieldvalue;
-		# remove useless chars in the title.
-		$title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $title;
-		my @words = split / /, $subfieldvalue;
-		# and retrieve the reversed entry
-		foreach my $word (@words) {
-			my $localkey = $tag.$subfieldcode.'|'.uc($word);
-			$largehash{$localkey}.=",".substr($title,0,15)."-$biblionumber";
-		}
-		$timeneeded = gettimeofday - $starttime unless ($i % 30000);
-		print "$i in $timeneeded s\n" unless ($i % 30000);
-		print "." unless ($i % 500);
-		$i++;
-	}
-	$i=0;
-	print "WRITING\n";
-	foreach my $k (keys %largehash) {
-		$k =~ /(.*)\|(.*)/;
-		$insertT->execute($1,$2,$largehash{$k});
-		$timeneeded = gettimeofday - $starttime unless ($i % 30000);
-		print "$i in $timeneeded s\n" unless ($i % 30000);
-		print "." unless ($i % 500);
-		$i++;
-	}
-}
-
-$dbh->disconnect();

Index: build_marc_word.pl
===================================================================
RCS file: build_marc_word.pl
diff -N build_marc_word.pl
--- build_marc_word.pl	11 Jun 2004 15:07:48 -0000	1.1
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-#-----------------------------------
-# Script Name: build_marc_word.pl
-# Script Version: 0.1.0
-# Date:  2004/06/05
-# Author:  Joshua Ferraro [jmf at kados dot org]
-# Description: This script builds a new marc_word
-#  table with a reduced number of tags (only those
-#  tags that should be searched) allowing for
-#  faster and more accurate searching when used
-#  with the SearchMarc routines.  Make sure that
-#  the MARCaddword routine in Biblio.pm will index
-#  characters >= 1 char; otherwise, searches like
-#  "O'brian, Patrick" will fail as the search 
-#  routines will seperate that query into "o", 
-#  "brian", and "patrick".  (If "o" is not in the
-#  database the search will fail)
-# Usage: build_marc_word.pl
-# Revision History:
-#    0.1.0  2004/06/11:  first working version.
-#    			 Thanks to Chris Cormack
-#    			 for helping with the $data object
-#    			 and Stephen Hedges for providing
-#    			 the list of MARC tags.
-# FixMe:
-#   *Should add a few parameters like 'delete from
-#    marc_word' or make script ask user whether to
-#    perform that task ...
-#   *Add a 'status' report as the data is loaded ... 
-#-----------------------------------
-use lib '/usr/local/koha/intranet/modules/';
-use strict;
-use C4::Context;
-use C4::Biblio;
-my $dbh=C4::Context->dbh;
-
-#Here is where you name the tags that you wish to index.  If you
-# are using MARC21 this set of default tags should be fine but you
-# may need to add holdings tags specific to your library (e.g., holding
-# branch for Nelsonville is 942k but that may not be the case for your
-# library).
-my @tags=(
-
-#Tag documentation from http://lcweb.loc.gov/marc/bibliographic/ecbdhome.html
-
-"020a", # INTERNATIONAL STANDARD BOOK NUMBER
-"022a", # INTERNATIONAL STANDARD SERIAL NUMBER
-"100a",	# MAIN ENTRY--PERSONAL NAME
-"110a",	# MAIN ENTRY--CORPORATE NAME
-"110b",	#   Subordinate unit
-"110c",	#   Location of meeting
-"111a", # MAIN ENTRY--MEETING NAME
-"111c", #   Location of meeting
-"130a", # MAIN ENTRY--UNIFORM TITLE 
-"240a", # UNIFORM TITLE 
-"245a", # TITLE STATEMENT
-"245b", #   Remainder of title
-"245c", #   Statement of responsibility, etc.
-"245p", #   Name of part/section of a work
-"246a", # VARYING FORM OF TITLE
-"246b", #   Remainder of title
-"260b", # PUBLICATION, DISTRIBUTION, ETC. (IMPRINT)
-"440a", # SERIES STATEMENT/ADDED ENTRY--TITLE
-"440p", #   Name of part/section of a work
-"500a", # GENERAL NOTE
-"505t", # FORMATTED CONTENTS NOTE (t is Title)
-"511a", # PARTICIPANT OR PERFORMER NOTE
-"520a", # SUMMARY, ETC.
-"534a", # ORIGINAL VERSION NOTE 
-"534k", #   Key title of original
-"534t", #   Title statement of original
-"586a", # AWARDS NOTE
-"600a", # SUBJECT ADDED ENTRY--PERSONAL NAME 
-"610a", # SUBJECT ADDED ENTRY--CORPORATE NAME
-"611a", # SUBJECT ADDED ENTRY--MEETING NAME
-"630a", # SUBJECT ADDED ENTRY--UNIFORM TITLE
-"650a", # SUBJECT ADDED ENTRY--TOPICAL TERM
-"651a", # SUBJECT ADDED ENTRY--GEOGRAPHIC NAME
-"700a", # ADDED ENTRY--PERSONAL NAME
-"710a", # ADDED ENTRY--CORPORATE NAME
-"711a", # ADDED ENTRY--MEETING NAME
-"720a", # ADDED ENTRY--UNCONTROLLED NAME
-"730a", # ADDED ENTRY--UNIFORM TITLE
-"740a", # ADDED ENTRY--UNCONTROLLED RELATED/ANALYTICAL TITLE
-"752a", # ADDED ENTRY--HIERARCHICAL PLACE NAME
-"800a", # SERIES ADDED ENTRY--PERSONAL NAME
-"810a", # SERIES ADDED ENTRY--CORPORATE NAME
-"811a", # SERIES ADDED ENTRY--MEETING NAME
-"830a", # SERIES ADDED ENTRY--UNIFORM TITLE
-"942k"  # Holdings Branch ?? Unique to NPL??
-);
-
-#note that subfieldcode in marc_subfield_table is subfieldid in marc_word ... even
-#though there is another subfieldid in marc_subfield_table--very confusing naming conventions!
-
-#For each tag we run a search to find the necessary data for building the marc_word table
-foreach my $this_tagid(@tags) {
-	my $query="SELECT bibid,tag,tagorder,subfieldcode,subfieldorder,subfieldvalue FROM marc_subfield_table WHERE tag=? AND subfieldcode=?";
-	my $sth=$dbh->prepare($query);
-
-	my ($tag, $subfieldid);
-
-#split the tag into tag, subfield
-	if ($this_tagid =~ s/(\D+)//) {
-		$subfieldid = $1;
-		$tag = $this_tagid;
-	}
-#Then we pass this information on to MARCaddword in Biblio.pm to actually perform the import into marc_word
-	$sth->execute($tag, $subfieldid);
-	while (my $data=$sth->fetchrow_hashref()){
-		MARCaddword($dbh,$data->{'bibid'},$data->{'tag'},$data->{'tagorder'},$data->{'subfieldcode'},$data->{'subfieldorder'},$data->{'subfieldvalue'});
-	}
-}
-$dbh->disconnect();





More information about the Koha-cvs mailing list