Generative AI

Design an Accurate Retrieve-and-Rerank Pipeline with the ZeroEntropy zerank-2 Reranker

# Section banner. The newline escapes are required so the title sits on its
# own line between two rules of "=" characters.
print("\n" + "=" * 70 + "\nPART 4: NDCG@10 evaluation\n" + "=" * 70)

# Hand-labelled evaluation set. Each entry pairs a query with "rels", a map
# from corpus index to a graded relevance label. Indices missing from the map
# are treated as relevance 0 when NDCG is computed.
eval_set = [
    {
        "query": "Where is most ATP produced in the cell?",
        "rels": {0: 2, 2: 3, 4: 2, 6: 1, 8: 3},
    },
    {
        "query": "How do plants capture light energy?",
        "rels": {1: 3, 9: 1},
    },
    {
        "query": "How are proteins made and packaged in a cell?",
        "rels": {5: 3, 7: 2},
    },
]
def dcg(rels):
    """Return the discounted cumulative gain of a ranked list of relevance grades.

    Each grade ``r`` at 1-based rank ``i`` contributes ``(2**r - 1) / log2(i + 1)``.
    An empty input yields 0.0.
    """
    grades = np.asarray(rels, dtype=float)
    gains = np.power(2, grades) - 1
    # Ranks 1..n map to discounts log2(2)..log2(n + 1).
    discounts = np.log2(np.arange(2, grades.size + 2))
    return np.sum(gains / discounts)
def ndcg_at_k(ranked_doc_ids, rel_map, k=10):
    """Return NDCG@k for one ranking.

    ranked_doc_ids: document ids in ranked order; only the first ``k`` are scored.
    rel_map: mapping from document id to relevance grade; ids absent from the
        map count as grade 0.
    k: cutoff applied to both the ranking and the ideal ordering.

    Returns DCG of the ranking divided by DCG of the ideal ordering (grades
    sorted descending), or 0.0 when the ideal DCG is not positive.
    """
    observed = [rel_map.get(doc_id, 0) for doc_id in ranked_doc_ids[:k]]
    best_case = sorted(rel_map.values(), reverse=True)[:k]
    ideal_dcg = dcg(best_case)
    # Guard against division by zero when nothing relevant is labelled.
    if not ideal_dcg > 0:
        return 0.0
    return dcg(observed) / ideal_dcg
# NDCG@10 per query: once for the bi-encoder ordering, once after reranking.
base_scores = []
rr_scores = []
for ex in eval_set:
    q = ex["query"]
    rel_map = ex["rels"]

    # Stage 1: embed the query and order the whole corpus by bi-encoder score.
    q_emb = bi.encode(q, convert_to_tensor=True, normalize_embeddings=True)
    hits = util.semantic_search(q_emb, corpus_emb, top_k=len(corpus))[0]
    base_order = [hit["corpus_id"] for hit in hits]
    base_scores.append(ndcg_at_k(base_order, rel_map))

    # Stage 2: rerank the same candidates. The "corpus_id" of each result is
    # used as a position in the candidate list handed to the reranker, so it
    # is translated back to a corpus index through base_order.
    rr = reranker.rank(
        q,
        [corpus[idx] for idx in base_order],
        convert_to_tensor=True,
    )
    rr_order = [base_order[item["corpus_id"]] for item in rr]
    rr_scores.append(ndcg_at_k(rr_order, rel_map))
# Results table: one row per query (text truncated to 43 characters) showing
# NDCG@10 before and after reranking, followed by the averages.
print(f"{'Query':45s} {'bi-encoder':>12s} {'+ zerank-2':>12s}")
for ex, b, r in zip(eval_set, base_scores, rr_scores):
    print(f"{ex['query'][:43]:45s} {b:12.4f} {r:12.4f}")
print("-" * 72)
# Compute each mean once and reuse it for both the summary row and the lift.
base_mean = np.mean(base_scores)
rr_mean = np.mean(rr_scores)
print(f"{'AVERAGE NDCG@10':45s} {base_mean:12.4f} {rr_mean:12.4f}")
# The leading newline separates the lift summary from the table.
print(f"\nReranking lift: {rr_mean - base_mean:+.4f} NDCG@10")

Source link

Related Articles

Leave a Reply

Your email address will not be published. Required fields are marked *

Back to top button