[Koha-cvs] CVS: koha/misc build_marc_Tword.pl,1.1,1.2

Wed Jun 1 20:47:41 CEST 2005

Update of /cvsroot/koha/koha/misc
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12257/misc

Modified Files:
	build_marc_Tword.pl 
Log Message:
New version that processes the records one MARC tag at a time. It should be slower, but requires a lot less memory.

Index: build_marc_Tword.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/build_marc_Tword.pl,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -r1.1 -r1.2
*** build_marc_Tword.pl	27 May 2005 09:30:24 -0000	1.1
--- build_marc_Tword.pl	1 Jun 2005 18:47:38 -0000	1.2
***************
*** 55,124 ****

  # parse every line
! my $query="SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber";
  my $sth=$dbh->prepare($query);

! print "******** SELECTING \n";
! $sth->execute;
! print "******** DONE \n";
! $|=1; # flushes output
! 
! my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
! my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
! my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
! my $i=0;
! my $timeneeded;
! # 1st version, slower, but less RAM consumming
! # while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
! # 	next if $ignore_list{"$tag.$subfieldcode"};
! #     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! # 	# remove useless chars in the title.
! #     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! #     my @words = split / /, $subfieldvalue;
! # 	# and retrieve the reversed entry
! # 	foreach my $word (@words) {
! # 		$sthT->execute($tag.$subfieldcode,$word);
! # 		if (my ($usedin) = $sthT->fetchrow) {
! # 			# add the field & save it once again.
! # 			$usedin.=",$biblionumber-$title";
! # 			$updateT->execute($usedin,$tag.$subfieldcode,$word);
! # 		} else {
! # 			$insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
! # 		}
! # 	}
! # 	$timeneeded = gettimeofday - $starttime unless ($i % 100);
! # 	print "$i in $timeneeded s\n" unless ($i % 100);
! # 	print ".";
! # 	$i++;
! # }
! 
! # 2nd version : faster (about 100 times !), bug maybe too much RAM consumming...
! my %largehash;
! print "READING\n";
! while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
! 	next unless $subfieldvalue;
! 	next if $ignore_list{$tag.$subfieldcode};
!     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! 	# remove useless chars in the title.
!     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
!     my @words = split / /, $subfieldvalue;
! 	# and retrieve the reversed entry
! 	foreach my $word (@words) {
! 		my $localkey = $tag.$subfieldcode.'|'.uc($word);
! 		$largehash{$localkey}.=",$title-$biblionumber";
  	}
- 	$timeneeded = gettimeofday - $starttime unless ($i % 30000);
- 	print "$i in $timeneeded s\n" unless ($i % 30000);
- 	print "." unless ($i % 500);
- 	$i++;
- }
- $i=0;
- print "WRITING\n";
- foreach my $k (keys %largehash) {
- 	$k =~ /(.*)\|(.*)/;
- 	$insertT->execute($1,$2,$largehash{$k});
- 	$timeneeded = gettimeofday - $starttime unless ($i % 30000);
- 	print "$i in $timeneeded s\n" unless ($i % 30000);
- 	print "." unless ($i % 500);
- 	$i++;
  }

--- 55,126 ----

  # parse every line
! my $query="SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber and tag=?";
  my $sth=$dbh->prepare($query);

! for (my $looptag=0;$looptag<=999;$looptag++) {
! 	print "******** SELECTING ".(sprintf "%03s",$looptag)."\n";
! 	$sth->execute(sprintf "%03s",$looptag);
! 	print "******** DONE \n";
! 	$|=1; # flushes output
! 	
! 	my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
! 	my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
! 	my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
! 	my $i=0;
! 	my $timeneeded;
! 	# 1st version, slower, but less RAM consumming
! 	# while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
! 	# 	next if $ignore_list{"$tag.$subfieldcode"};
! 	#     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! 	# 	# remove useless chars in the title.
! 	#     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
! 	#     my @words = split / /, $subfieldvalue;
! 	# 	# and retrieve the reversed entry
! 	# 	foreach my $word (@words) {
! 	# 		$sthT->execute($tag.$subfieldcode,$word);
! 	# 		if (my ($usedin) = $sthT->fetchrow) {
! 	# 			# add the field & save it once again.
! 	# 			$usedin.=",$biblionumber-$title";
! 	# 			$updateT->execute($usedin,$tag.$subfieldcode,$word);
! 	# 		} else {
! 	# 			$insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
! 	# 		}
! 	# 	}
! 	# 	$timeneeded = gettimeofday - $starttime unless ($i % 100);
! 	# 	print "$i in $timeneeded s\n" unless ($i % 100);
! 	# 	print ".";
! 	# 	$i++;
! 	# }
! 	
! 	# 2nd version : faster (about 100 times !), bug maybe too much RAM consumming...
! 	my %largehash;
! 	print "READING\n";
! 	while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
! 		next unless $subfieldvalue;
! 		next if $ignore_list{$tag.$subfieldcode};
! 		$subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $subfieldvalue;
! 		# remove useless chars in the title.
! 		$title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $title;
! 		my @words = split / /, $subfieldvalue;
! 		# and retrieve the reversed entry
! 		foreach my $word (@words) {
! 			my $localkey = $tag.$subfieldcode.'|'.uc($word);
! 			$largehash{$localkey}.=",".substr($title,0,15)."-$biblionumber";
! 		}
! 		$timeneeded = gettimeofday - $starttime unless ($i % 30000);
! 		print "$i in $timeneeded s\n" unless ($i % 30000);
! 		print "." unless ($i % 500);
! 		$i++;
! 	}
! 	$i=0;
! 	print "WRITING\n";
! 	foreach my $k (keys %largehash) {
! 		$k =~ /(.*)\|(.*)/;
! 		$insertT->execute($1,$2,$largehash{$k});
! 		$timeneeded = gettimeofday - $starttime unless ($i % 30000);
! 		print "$i in $timeneeded s\n" unless ($i % 30000);
! 		print "." unless ($i % 500);
! 		$i++;
  	}
  }