# ------------------------------------------------------------------------ # HtDig.pm: Perl Module for interface to Ht:/Dig databases & stuff. # Ht:/Dig is a free Intranet search engine http://www.htdig.org/ # Version: 0.1b # Author: heddy Boubaker # Revisions: # 05 Feb 1999 BirthDate # ------------------------------------------------------------------------ #use diagnostics -verbose; #use strict; require 5.00502; use Carp; use BerkeleyDB; package HtDig; # Configurable vars $home = '/www/Tools/Htdig'; # ------------------------------------------------------------------------ package HtDig::Config; # Interface to the htdig config # Configurable vars $config_dir = $HtDig::home . '/conf'; # Some default values $common_dir = $HtDig::home . '/common'; $create_images_list = 0; $create_url_list = 0; $database_dir = $HtDig::home . '/db'; $database_base = $database_dir . '/db'; $doc_db = $database_base . '.docdb'; $doc_index = $database_base . '.docs.index'; $doc_list = $database_base . '.docs'; $image_list = $database_base . '.images'; $server_aliases = ''; $start_url = 'http://www/'; $synonym_dictionary = $common_dir . '/synonyms'; $synonym_db = $common_dir . '/synonyms.db'; $url_list = $database_base . '.urls'; $word_db = $database_base . '.words.db'; $word_list = $database_base . '.wordlist'; # The configuration %config = ( 'common_dir' => $common_dir, 'config_dir' => $config_dir, 'create_url_list' => $create_url_list, 'create_images_list' => $create_images_list, 'database_dir' => $database_dir, 'database_base' => $database_base, 'doc_db' => $doc_db, 'doc_index' => $doc_index, 'doc_list' => $doc_list, 'image_list' => $image_list, 'server_aliases' => $server_aliases, 'start_url' => $start_url, 'synonym_dictionary' => $synonym_dictionary, 'synonym_db' => $synonym_db, 'url_list' => $url_list, 'word_db' => $word_db, 'word_list' => $word_list, ); ### # parse( ); # TODO ### #sub parse ( $ ) { # my $configfile = shift; #} # end parse(); # ------------------------------------------------------------------------ package HtDig::DocDB; # Interface to the docdb database # Configurable vars $db_file = $HtDig::Config::config{'doc_db'}; # The current record that could be accessed in # $func prom within process() %record = {}; ### # process( ); ### sub process ( &$ ) { my ( $func, $file ) = @_; my ( %db, $key, $value ); tie( %db, 'BerkeleyDB::Btree', -Filename => $file, -Flags => DB_RDONLY ) || die "Error: $file - $!"; while (( $key, $value ) = each %db ) { next if $key =~ /^nextDocID/; &_parse_value( $key, $value ); #print "Eval w/ $key\n"; eval &$func; } } # end process(); sub _parse_value ($$) { my ( $key, $value ) = @_; my ( $length, $count, $result, $what ); # reset record $record = {}; while ( length( $value ) > 0 ) { $what = unpack("C", $value); $value = substr($value, 1); if ($what == 0) { # ID $record{"ID"} = unpack("i", $value); $value = substr($value, 4); } elsif ($what == 1) { # TIME $record{"TIME"} = unpack("i", $value); $value = substr($value, 4); } elsif ($what == 2) { # ACCESSED $record{"ACCESSED"} = unpack("i", $value); $value = substr($value, 4); } elsif ($what == 3) { # STATE $record{"STATE"} = unpack("i", $value); $value = substr($value, 4); } elsif ($what == 4) { # SIZE $record{"SIZE"} = unpack("i", $value); $value = substr($value, 4); } elsif ($what == 5) { # LINKS $record{"LINKS"} = unpack("i", $value); $value = substr($value, 4); } elsif ($what == 6) { # IMAGESIZE $record{"IMAGESIZE"} = unpack("i", $value); $value = substr($value, 4); } elsif ($what == 7) { # HOPCOUNT $record{"HOPCOUNT"} = unpack("i", $value); $value = substr($value, 4); } elsif ($what == 8) { # URL $length = unpack("i", $value); $record{"URL"} = unpack("x4 A$length", $value); $value = substr($value, 4 + $length); } elsif ($what == 9) { # HEAD $length = unpack("i", $value); $record{"HEAD"} = unpack("x4 A$length", $value); $value = substr($value, 4 + $length); } elsif ($what == 10) { # TITLE $length = unpack("i", $value); $record{"TITLE"} = unpack("x4 A$length", $value); $value = substr($value, 4 + $length); } elsif ($what == 11) { # DESCRIPTIONS $count = unpack("i", $value); $value = substr($value, 4); $result = ""; foreach (1 .. $count) { $length = unpack("i", $value); $result = $result . unpack("x4 A$length", $value) . " "; $value = substr($value, 4 + $length); } chop $result; $record{"DESCRIPTIONS"} = $result; } elsif ($what == 12) { # ANCHORS $count = unpack("i", $value); $value = substr($value, 4); $result = ""; foreach (1 .. $count) { $length = unpack("i", $value); $result = $result . unpack("x4 A$length", $value) . " "; $value = substr($value, 4 + $length); } chop $result; $record{"ANCHORS"} = $result; } elsif ($what == 13) { # EMAIL $length = unpack("i", $value); $record{"EMAIL"} = unpack("x4 A$length", $value); $value = substr($value, 4 + $length); } elsif ($what == 14) { # NOTIFICATION $length = unpack("i", $value); $record{"NOTIFICATION"} = unpack("x4 A$length", $value); $value = substr($value, 4 + $length); } elsif ($what == 15) { # SUBJECT $length = unpack("i", $value); $record{"SUBJECT"} = unpack("x4 A$length", $value); $value = substr($value, 4 + $length); } elsif ($what == 16) { # STRING (ignore, but unpack) $length = unpack("i", $value); $record{"STRING"} = unpack("x4 A$length", $value); $value = substr($value, 4 + $length); } elsif ($what == 17) { # METADSC $length = unpack("i", $value); $record{"METADSC"} = unpack("x4 A$length", $value); $value = substr($value, 4 + $length); } elsif ($what == 18) { # BACKLINKS $record{"BACKLINKS"} = unpack("i", $value); $value = substr($value, 4); } elsif ($what == 19) { # SIGNATURE $record{"SIG"} = unpack("i", $value); $value = substr($value, 4); } } } # ------------------------------------------------------------------------ package HtDig::WordDB; # Interface to the worddb database # Configurable vars $db_file = $HtDig::Config::config{'word_db'}; # The current record that could be accessed in # $func prom within process() %record = {}; ### # process( ); ### sub process ( &$ ) { my ( $func, $file ) = @_; my ( %db, $key, $value ); tie( %db, 'BerkeleyDB::Btree', -Filename => $file, -Flags => DB_RDONLY ) || die "Error: $file - $!"; while (( $key, $value ) = each %db ) { #next if $key =~ /^nextDocID/; &_parse_value( $key, $value ); #print "Eval w/ $key\n"; eval &$func; } } # end process(); sub _parse_value ($$) { my ( $key, $value ) = @_; my ( $length, $count, $id, $weight, $anchor, $location ); # reset record $record = {}; $record{'WORD'} = $key; # extracted from wordfreq.pl $length = length( $value ) / 20; $record{'TOTAL'} = 0; $record{'NDOCS'} = 0; $record{'DOCS'} = []; foreach $i ( 0 .. $length - 1 ) { ($count, $id, $weight, $anchor, $location ) = unpack("i i i i i", substr( $value, $i * 20, 20 )); $record{'TOTAL'} += $count; $record{'NDOCS'}++; $record{'DOCS'}[$i] = { 'ID' => $id, 'WEIGHT' => $weight, 'ANCHOR' => $anchor, 'LOCATION' => $location }; } } # In case of required... 1; # HtDig.pm ends here ------------------------------------------------