[Koha-cvs] CVS: koha/misc build_marc_Tword.pl,1.1,1.2
Paul POULAIN
tipaul at users.sourceforge.net
Wed Jun 1 20:47:41 CEST 2005
Update of /cvsroot/koha/koha/misc
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12257/misc
Modified Files:
build_marc_Tword.pl
Log Message:
new version, doing tag by tag. Should be slower, but requires a lot less memory
Index: build_marc_Tword.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/build_marc_Tword.pl,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -r1.1 -r1.2
*** build_marc_Tword.pl 27 May 2005 09:30:24 -0000 1.1
--- build_marc_Tword.pl 1 Jun 2005 18:47:38 -0000 1.2
***************
*** 55,124 ****
# parse every line
! my $query="SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber";
my $sth=$dbh->prepare($query);
! print "******** SELECTING \n";
! $sth->execute;
! print "******** DONE \n";
! $|=1; # flushes output
!
! my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
! my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
! my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
! my $i=0;
! my $timeneeded;
! # 1st version, slower, but less RAM consuming
! # while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
! # next if $ignore_list{"$tag.$subfieldcode"};
! # $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! # # remove useless chars in the title.
! # $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! # my @words = split / /, $subfieldvalue;
! # # and retrieve the reversed entry
! # foreach my $word (@words) {
! # $sthT->execute($tag.$subfieldcode,$word);
! # if (my ($usedin) = $sthT->fetchrow) {
! # # add the field & save it once again.
! # $usedin.=",$biblionumber-$title";
! # $updateT->execute($usedin,$tag.$subfieldcode,$word);
! # } else {
! # $insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
! # }
! # }
! # $timeneeded = gettimeofday - $starttime unless ($i % 100);
! # print "$i in $timeneeded s\n" unless ($i % 100);
! # print ".";
! # $i++;
! # }
!
! # 2nd version: faster (about 100 times!), but may consume too much RAM...
! my %largehash;
! print "READING\n";
! while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
! next unless $subfieldvalue;
! next if $ignore_list{$tag.$subfieldcode};
! $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! # remove useless chars in the title.
! $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! my @words = split / /, $subfieldvalue;
! # and retrieve the reversed entry
! foreach my $word (@words) {
! my $localkey = $tag.$subfieldcode.'|'.uc($word);
! $largehash{$localkey}.=",$title-$biblionumber";
}
- $timeneeded = gettimeofday - $starttime unless ($i % 30000);
- print "$i in $timeneeded s\n" unless ($i % 30000);
- print "." unless ($i % 500);
- $i++;
- }
- $i=0;
- print "WRITING\n";
- foreach my $k (keys %largehash) {
- $k =~ /(.*)\|(.*)/;
- $insertT->execute($1,$2,$largehash{$k});
- $timeneeded = gettimeofday - $starttime unless ($i % 30000);
- print "$i in $timeneeded s\n" unless ($i % 30000);
- print "." unless ($i % 500);
- $i++;
}
--- 55,126 ----
# parse every line
! my $query="SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber and tag=?";
my $sth=$dbh->prepare($query);
! for (my $looptag=0;$looptag<=999;$looptag++) {
! print "******** SELECTING ".(sprintf "%03s",$looptag)."\n";
! $sth->execute(sprintf "%03s",$looptag);
! print "******** DONE \n";
! $|=1; # flushes output
!
! my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
! my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
! my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
! my $i=0;
! my $timeneeded;
! # 1st version, slower, but less RAM consuming
! # while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
! # next if $ignore_list{"$tag.$subfieldcode"};
! # $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! # # remove useless chars in the title.
! # $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! # my @words = split / /, $subfieldvalue;
! # # and retrieve the reversed entry
! # foreach my $word (@words) {
! # $sthT->execute($tag.$subfieldcode,$word);
! # if (my ($usedin) = $sthT->fetchrow) {
! # # add the field & save it once again.
! # $usedin.=",$biblionumber-$title";
! # $updateT->execute($usedin,$tag.$subfieldcode,$word);
! # } else {
! # $insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
! # }
! # }
! # $timeneeded = gettimeofday - $starttime unless ($i % 100);
! # print "$i in $timeneeded s\n" unless ($i % 100);
! # print ".";
! # $i++;
! # }
!
! # 2nd version: faster (about 100 times!), but may consume too much RAM...
! my %largehash;
! print "READING\n";
! while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
! next unless $subfieldvalue;
! next if $ignore_list{$tag.$subfieldcode};
! $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $subfieldvalue;
! # remove useless chars in the title.
! $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $title;
! my @words = split / /, $subfieldvalue;
! # and retrieve the reversed entry
! foreach my $word (@words) {
! my $localkey = $tag.$subfieldcode.'|'.uc($word);
! $largehash{$localkey}.=",".substr($title,0,15)."-$biblionumber";
! }
! $timeneeded = gettimeofday - $starttime unless ($i % 30000);
! print "$i in $timeneeded s\n" unless ($i % 30000);
! print "." unless ($i % 500);
! $i++;
! }
! $i=0;
! print "WRITING\n";
! foreach my $k (keys %largehash) {
! $k =~ /(.*)\|(.*)/;
! $insertT->execute($1,$2,$largehash{$k});
! $timeneeded = gettimeofday - $starttime unless ($i % 30000);
! print "$i in $timeneeded s\n" unless ($i % 30000);
! print "." unless ($i % 500);
! $i++;
}
}
More information about the Koha-cvs
mailing list