Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update bm25 algorithm with term matching and field normalization #190

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 33 additions & 13 deletions src/Indexer/TNTIndexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,15 @@ public function createIndex($indexName)
term_id INTEGER,
doc_id INTEGER,
field_id INTEGER,
field_len INTEGER,
position INTEGER,
hit_count INTEGER)");

$this->index->exec("CREATE TABLE IF NOT EXISTS docinfo (
doc_id INTEGER,
field_id INTEGER,
num_terms INTEGER)");

$this->index->exec("CREATE TABLE IF NOT EXISTS info (
key TEXT,
value INTEGER)");
Expand Down Expand Up @@ -455,6 +461,7 @@ public function saveToIndex($stems, $docId)
$terms = $this->saveWordlist($stems);
$this->saveDoclist($terms, $docId);
$this->saveHitList($stems, $docId, $terms);
$this->saveDocInfo($stems, $docId);
}

/**
Expand Down Expand Up @@ -538,33 +545,46 @@ public function saveDoclist($terms, $docId)

public function saveHitList($stems, $docId, $termsList)
{
return;
$fieldCounter = 0;
$fields = [];

$insert = "INSERT INTO hitlist (term_id, doc_id, field_id, position, hit_count)
VALUES (:term_id, :doc_id, :field_id, :position, :hit_count)";
$insert = "INSERT INTO hitlist (term_id, doc_id, field_id, field_len, hit_count)
VALUES (:term_id, :doc_id, :field_id, :field_len, :hit_count)";
$stmt = $this->index->prepare($insert);

foreach ($stems as $field => $terms) {
$fields[$fieldCounter] = $field;
$positionCounter = 0;
$termCounts = array_count_values($terms);
foreach ($terms as $term) {
if (isset($termsList[$term])) {
$stmt->bindValue(':term_id', $termsList[$term]['id']);
$stmt->bindValue(':doc_id', $docId);
$stmt->bindValue(':field_id', $fieldCounter);
$stmt->bindValue(':position', $positionCounter);
$stmt->bindValue(':hit_count', $termCounts[$term]);
$stmt->execute();
}
$positionCounter++;
$field_len = count($terms);
foreach ($termCounts as $term => $hitCount) {
$stmt->bindValue(':term_id', $termsList[$term]['id']);
$stmt->bindValue(':doc_id', $docId);
$stmt->bindValue(':field_id', $fieldCounter);
$stmt->bindValue(':field_len', $field_len);
$stmt->bindValue(':hit_count', $termCounts[$term]);
$stmt->execute();
}
$fieldCounter++;
}
}

/**
 * Record how many terms each field of a document contains.
 *
 * Inserts one row into the `docinfo` table per entry of $stems. A field is
 * identified by its zero-based position while iterating $stems, not by its key.
 *
 * @param array|\Traversable $stems one countable list of terms per field
 * @param mixed              $docId value stored in docinfo.doc_id
 *
 * @return void
 */
public function saveDocInfo($stems, $docId)
{
    // The SQL text is identical for every field, so prepare it once and
    // re-execute it with fresh bindings instead of re-preparing per iteration.
    $insert = "INSERT INTO docinfo (doc_id, field_id, num_terms) VALUES (:doc, :field_id, :num_terms)";
    $stmt = $this->index->prepare($insert);

    // The document id does not change between rows; bound values persist
    // across successive execute() calls.
    $stmt->bindValue(':doc', $docId);

    $fieldCounter = 0;
    foreach ($stems as $terms) {
        $stmt->bindValue(':field_id', $fieldCounter);
        $stmt->bindValue(':num_terms', count($terms));
        $stmt->execute();
        $fieldCounter++;
    }
}

public function getWordFromWordList($word)
{
$selectStmt = $this->index->prepare("SELECT * FROM wordlist WHERE term like :keyword LIMIT 1");
Expand Down
86 changes: 77 additions & 9 deletions src/TNTSearch.php
Original file line number Diff line number Diff line change
Expand Up @@ -101,25 +101,35 @@ public function search($phrase, $numOfResults = 100)
return $this->stemmer->stem($keyword);
});

$tfWeight = 1;
$dlWeight = 0.5;
$tfWeight = 1.2;
$dlWeight = 0.75;
$docScores = [];
$count = $this->totalDocumentsInCollection();
$avgFlen = $this->getAverageFieldLength();
$docTerms = array();

foreach ($keywords as $index => $term) {
$isLastKeyword = ($keywords->count() - 1) == $index;
$df = $this->totalMatchingDocuments($term, $isLastKeyword);
$idf = log($count / max(1, $df));
foreach ($this->getAllDocumentsForKeyword($term, false, $isLastKeyword) as $document) {
$docID = $document['doc_id'];
$tf = $document['hit_count'];
$idf = log(1 + ($count - $df + 0.5) / ($df + 0.5));
foreach ($this->getAllHitsForKeyword($term, true, $isLastKeyword) as $hit) {
$docID = $hit['doc_id'];
$tf = $hit['hit_count'];
$dlen = $hit['field_len'];
$fnorm = 1/sqrt($hit['field_len']);
$num = ($tfWeight + 1) * $tf;
$avgDlen = $avgFlen[$hit['field_id']];
$denom = $tfWeight
* ((1 - $dlWeight) + $dlWeight)
* ((1 - $dlWeight) + $dlWeight * $dlen / $avgDlen)
+ $tf;
$score = $idf * ($num / $denom);
$score = $fnorm * $idf * ($num / $denom);
$docScores[$docID] = isset($docScores[$docID]) ?
$docScores[$docID] + $score : $score;

if (!isset($docTerms[$docID])) {
$docTerms[$docID] = array();
}
$docTerms[$docID][$term] = 1;
}
}

Expand All @@ -128,7 +138,9 @@ public function search($phrase, $numOfResults = 100)
$docs = new Collection($docScores);

$totalHits = $docs->count();
$docs = $docs->map(function ($doc, $key) {
$docs = $docs->filter(function ($score, $docID) use ($docTerms, $keywords) {
return (count($docTerms[$docID]) == $keywords->count());
})->map(function ($doc, $key) {
return $key;
})->take($numOfResults);
$stopTimer = microtime(true);
Expand Down Expand Up @@ -254,6 +266,24 @@ public function getAllDocumentsForKeyword($keyword, $noLimit = false, $isLastKey
return $this->getAllDocumentsForStrictKeyword($word, $noLimit);
}

/**
 * Look up the hitlist rows for a keyword.
 *
 * The keyword is resolved through getWordlistByKeyword(). When that result has
 * no element at index 0 an empty Collection is returned; otherwise the result
 * is handed on to getAllHitsForStrictKeyword().
 *
 * @param      $keyword
 * @param bool $noLimit       forwarded unchanged to getAllHitsForStrictKeyword()
 * @param bool $isLastKeyword forwarded unchanged to getWordlistByKeyword()
 *
 * @return Collection
 */
public function getAllHitsForKeyword($keyword, $noLimit = false, $isLastKeyword = false)
{
    $matches = $this->getWordlistByKeyword($keyword, $isLastKeyword);

    // TODO: Fuzzy
    return isset($matches[0])
        ? $this->getAllHitsForStrictKeyword($matches, $noLimit)
        : new Collection([]);
}

/**
* @param $keyword
* @param bool $noLimit
Expand Down Expand Up @@ -506,4 +536,42 @@ private function getAllDocumentsForStrictKeyword($word, $noLimit)
$stmtDoc->execute();
return new Collection($stmtDoc->fetchAll(PDO::FETCH_ASSOC));
}

/**
 * Fetch every hitlist row recorded for a single term, highest hit_count first.
 *
 * @param $word    lookup result; only the 'id' of its first element is read
 * @param $noLimit accepted for signature parity but not used by this method
 *
 * @return Collection the matching rows as associative arrays
 */
private function getAllHitsForStrictKeyword($word, $noLimit)
{
    // TODO: limit?
    $hits = $this->index->prepare("SELECT * FROM hitlist WHERE term_id = :id ORDER BY hit_count DESC");
    $hits->bindValue(':id', $word[0]['id']);
    $hits->execute();

    $rows = $hits->fetchAll(PDO::FETCH_ASSOC);

    return new Collection($rows);
}

/**
 * Compute the average number of terms per field across all indexed documents.
 *
 * Reads the `docinfo` table and averages `num_terms` separately for each
 * `field_id`.
 *
 * @return array average term count (float) keyed by field_id (int); an empty
 *               array when docinfo contains no rows
 */
private function getAverageFieldLength()
{
    // A single grouped query replaces the MAX(field_id) lookup followed by one
    // separately prepared AVG query per field.
    $query = "SELECT field_id, AVG(num_terms) FROM docinfo GROUP BY field_id";
    $stmtDoc = $this->index->prepare($query);
    $stmtDoc->execute();

    $avgFlen = array();
    foreach ($stmtDoc->fetchAll(PDO::FETCH_NUM) as $row) {
        // Cast explicitly: depending on driver settings the columns may come
        // back as strings, and the result is used in arithmetic and as an
        // array index.
        $avgFlen[(int) $row[0]] = (float) $row[1];
    }
    return $avgFlen;
}
}