#!/usr/bin/perl
use strict;
use warnings;
use Lucene;
use File::Basename;

# Builds a Lucene index under ./data from the records in "sample.out".
#
# Each usable input line has the shape
#     File-<fileid>-<field>: <value>
# Lines are grouped by <fileid> into one raw document (a hash of
# field => value), and every raw document becomes one Lucene document.
#
# Lucene stuff by Robin H. Johnson

my $filename = "sample.out";

# Three-argument open keeps the filename from being interpreted as a mode,
# and $! tells the user why the open failed.
open(my $fh, '<', $filename) or die "could not open $filename: $!";

# fileid => { field => value, ... }
my %rawdocs;
while (my $line = <$fh>) {
    # Drop the line terminator first so that a final line without a
    # trailing newline is parsed like every other line.
    chomp $line;

    # Only trust the captures when the match succeeded; otherwise $1..$3
    # would still hold the values from an earlier line (or be undefined).
    my ($fileid, $field, $value)
        = $line =~ /File-([^-]+)-([^:]+): ([^\n]*)/;
    if (!defined $fileid) {
        warn "$filename:$.: skipping unparseable line\n" if $line =~ /\S/;
        next;
    }

    # $fileid: numeric or "dist"
    # $field:  string, non-empty
    # $value:  string, may be empty
    #print "Fileid: ". $fileid . "\n";
    #print "field: ". $field . "\n";
    #print "Value: ". $value . "\n";

    # Autovivification creates the per-file hash on first use.
    $rawdocs{$fileid}{$field} = $value;
}
close($fh);

# Fields for indexing.
# our %fields = (
#     distfile => 'text',
#     filename => 'text',
#     isdist   => 'UnAnalyzedField',
#     size     => 'UnAnalyzedField',
#     mtime    => 'UnAnalyzedField',
#     md5      => 'UnAnalyzedField',
#     sha1     => 'UnAnalyzedField',
# );

# The analyzer should simply tokenize filenames by their parts.
# I would split on [/.-_] at least. Technically, using (\W|_|\d) as the
# class of split characters might be reasonable.
my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new();

# An already existing directory is fine; any other mkdir failure is fatal.
if (!-d "data") {
    mkdir "data" or die "could not create directory data: $!";
}
my $store  = Lucene::Store::FSDirectory->getDirectory("data", 0);
my $writer = Lucene::Index::IndexWriter->new($store, $analyzer, 1);
$writer->setMergeFactor(100);
$writer->setUseCompoundFile(0);
$writer->setMaxFieldLength(2048);
$writer->setMinMergeDocs(10);
$writer->setMaxMergeDocs(100);

# Add Documents here

# createdoc($distfile, $rawdoc)
#
# Builds one Lucene::Document from a raw document.
#
#   $distfile - value stored in the "distfile" field of every document
#   $rawdoc   - hash reference of field => value for a single file id
#
# A raw document with a true "isdistfile" value gets the fields
# origin/cat/pn/cpv (Text) and pv/pr/pf (Keyword). Any other raw document
# gets "path", "filename" and "directory" derived from its "name".
# md5/sha1/mtime/size are added as Keyword fields in both cases.
# Fields missing from $rawdoc are left out of the document.
#
# Returns the new Lucene::Document.
sub createdoc {
    my ($distfile, $rawdoc) = @_;

    # Undefined and false values both count as "not a distfile".
    my $isdist = $rawdoc->{isdistfile} ? 1 : 0;

    my $doc = Lucene::Document->new;
    $doc->add(Lucene::Document::Field->Text("distfile", $distfile));
    $doc->add(Lucene::Document::Field->Keyword("isdistfile", $isdist));

    if ($isdist) {
        for my $f (qw(origin cat pn cpv)) {
            $doc->add(Lucene::Document::Field->Text($f, $rawdoc->{$f}))
                if defined($rawdoc->{$f});
        }
        for my $f (qw(pv pr pf)) {
            $doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f}))
                if defined($rawdoc->{$f});
        }
    }
    else {
        # Guard like every other optional field: basename()/dirname()
        # on an undefined name would only produce warnings and junk.
        my $name = $rawdoc->{name};
        if (defined $name) {
            $doc->add(Lucene::Document::Field->Text("path", $name));
            $doc->add(Lucene::Document::Field->Text("filename", basename($name)));
            $doc->add(Lucene::Document::Field->Text("directory", dirname($name)));
        }
    }

    for my $f (qw(md5 sha1 mtime size)) {
        $doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f}))
            if defined($rawdoc->{$f});
    }

    return $doc;
}

# Look the "dist" record up without autovivifying it; a plain
# $rawdocs{dist}{name} would create an empty "dist" entry when the input
# has none, and that empty entry would then be indexed as a document.
# NOTE(review): $distfile stays undefined when the input has no
# File-dist-name line -- confirm whether that should be fatal.
my $distfile = exists $rawdocs{dist} ? $rawdocs{dist}{name} : undef;

foreach my $f (keys(%rawdocs)) {
    printf "%s\n", $f;
    my $doc = createdoc($distfile, $rawdocs{$f});
    $writer->addDocument($doc);
}

# End of Document adding
$writer->optimize();
$writer->close;
undef $writer;