diff options
-rw-r--r-- | scratch/semgrep/server.py | 37 |
-rw-r--r-- | scratch/semgrep/tests.py | 15 |
2 files changed, 28 insertions, 24 deletions
diff --git a/scratch/semgrep/server.py b/scratch/semgrep/server.py index 87eac5e..6f3ebcd 100644 --- a/scratch/semgrep/server.py +++ b/scratch/semgrep/server.py @@ -8,7 +8,7 @@ from http.server import BaseHTTPRequestHandler, HTTPServer import chromadb -def checksum(string): +def checksum(string: str): sha256 = hashlib.sha256() sha256.update(string.encode("utf-8")) return sha256.hexdigest()[:32] @@ -30,23 +30,24 @@ class MyRequestHandler(BaseHTTPRequestHandler): try: data = json.loads(post_data) - # Process the JSON data response_message = f"Received POST request with data: '{data}'\n" except ValueError: response_message = "Invalid JSON data" self.send_response(400) if query := data.get("query"): - self.log_message("Processing query '%s'", query.strip()) + self.log_message("Processing query '%s'", query.replace("\n", " ").strip()) response = collection.query(query_texts=ensure_list(query)) - elif paragraph := data.get("store"): - data, metadata = drop_duplicates(paragraph) - node = set(m.get("node-id") for m in metadata) - self.log_message("Processing metadata %s", node) + elif paragraphs := data.get("insert"): + data, metadata = drop_duplicates(paragraphs) + nodes = set(m.get("node-id") for m in metadata) + self.log_message("Processing metadata %s", nodes) + for node in nodes: + collection.delete(where={"node-id": node}) collection.add( - documents=data, metadatas=metadata, ids=[checksum(l) for l in data] + documents=data, metadatas=metadata, ids=list(map(checksum, data)) ) - response = f"Successfully inserted {node}" + response = f"Successfully inserted {nodes}" else: raise ValueError(f"Used wrong method. 
Sent: {data.keys()}") @@ -65,9 +66,9 @@ def run_server(port=8080): httpd.serve_forever() -def drop_duplicates(paragraph): - data = [data["document"] for data in paragraph] - metadata = [data["metadata"] for data in paragraph] +def drop_duplicates(paragraphs): + data = [data["document"].replace("\n", " ").strip() for data in paragraphs] + metadata = [data["metadata"] for data in paragraphs] dups = (x for x, count in collections.Counter(data).items() if count > 1) to_drop = [] for no in dups: @@ -79,18 +80,6 @@ def drop_duplicates(paragraph): return data, metadata -def test(): - sample = [ - {"document": "Hello", "metadata": 5}, - {"document": "World", "metadata": 8}, - {"document": "Hello", "metadata": 6}, - {"document": "Good", "metadata": 3}, - {"document": "World", "metadata": 9}, - ] - - assert drop_duplicates(sample) == (["Hello", "World", "Good"], [5, 8, 3]) - - def parse_arguments(args=None): parser = argparse.ArgumentParser( description="Run Semantic database server", diff --git a/scratch/semgrep/tests.py b/scratch/semgrep/tests.py new file mode 100644 index 0000000..0e5d233 --- /dev/null +++ b/scratch/semgrep/tests.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +import server as s + + +def test_drop_dup(): + sample = [ + {"document": "Hello", "metadata": 5}, + {"document": "World", "metadata": 8}, + {"document": "Hello", "metadata": 6}, + {"document": "Good", "metadata": 3}, + {"document": "World", "metadata": 9}, + {"document": "World\n\n", "metadata": 9}, + ] + + assert s.drop_duplicates(sample) == (["Hello", "World", "Good"], [5, 8, 3]) |