about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- scratch/semgrep/server.py 37
-rw-r--r-- scratch/semgrep/tests.py 15
2 files changed, 28 insertions, 24 deletions
diff --git a/scratch/semgrep/server.py b/scratch/semgrep/server.py
index 87eac5e..6f3ebcd 100644
--- a/scratch/semgrep/server.py
+++ b/scratch/semgrep/server.py
@@ -8,7 +8,7 @@ from http.server import BaseHTTPRequestHandler, HTTPServer
import chromadb
-def checksum(string):
+def checksum(string: str):
sha256 = hashlib.sha256()
sha256.update(string.encode("utf-8"))
return sha256.hexdigest()[:32]
@@ -30,23 +30,24 @@ class MyRequestHandler(BaseHTTPRequestHandler):
try:
data = json.loads(post_data)
- # Process the JSON data
response_message = f"Received POST request with data: '{data}'\n"
except ValueError:
response_message = "Invalid JSON data"
self.send_response(400)
if query := data.get("query"):
- self.log_message("Processing query '%s'", query.strip())
+ self.log_message("Processing query '%s'", query.replace("\n", " ").strip())
response = collection.query(query_texts=ensure_list(query))
- elif paragraph := data.get("store"):
- data, metadata = drop_duplicates(paragraph)
- node = set(m.get("node-id") for m in metadata)
- self.log_message("Processing metadata %s", node)
+ elif paragraphs := data.get("insert"):
+ data, metadata = drop_duplicates(paragraphs)
+ nodes = set(m.get("node-id") for m in metadata)
+ self.log_message("Processing metadata %s", nodes)
+ for node in nodes:
+ collection.delete(where={"node-id": node})
collection.add(
- documents=data, metadatas=metadata, ids=[checksum(l) for l in data]
+ documents=data, metadatas=metadata, ids=list(map(checksum, data))
)
- response = f"Successfully inserted {node}"
+ response = f"Successfully inserted {nodes}"
else:
raise ValueError(f"Used wrong method. Sent: {data.keys()}")
@@ -65,9 +66,9 @@ def run_server(port=8080):
httpd.serve_forever()
-def drop_duplicates(paragraph):
- data = [data["document"] for data in paragraph]
- metadata = [data["metadata"] for data in paragraph]
+def drop_duplicates(paragraphs):
+ data = [data["document"].replace("\n", " ").strip() for data in paragraphs]
+ metadata = [data["metadata"] for data in paragraphs]
dups = (x for x, count in collections.Counter(data).items() if count > 1)
to_drop = []
for no in dups:
@@ -79,18 +80,6 @@ def drop_duplicates(paragraph):
return data, metadata
-def test():
- sample = [
- {"document": "Hello", "metadata": 5},
- {"document": "World", "metadata": 8},
- {"document": "Hello", "metadata": 6},
- {"document": "Good", "metadata": 3},
- {"document": "World", "metadata": 9},
- ]
-
- assert drop_duplicates(sample) == (["Hello", "World", "Good"], [5, 8, 3])
-
-
def parse_arguments(args=None):
parser = argparse.ArgumentParser(
description="Run Semantic database server",
diff --git a/scratch/semgrep/tests.py b/scratch/semgrep/tests.py
new file mode 100644
index 0000000..0e5d233
--- /dev/null
+++ b/scratch/semgrep/tests.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import server as s
+
+
+def test_drop_dup():
+ sample = [
+ {"document": "Hello", "metadata": 5},
+ {"document": "World", "metadata": 8},
+ {"document": "Hello", "metadata": 6},
+ {"document": "Good", "metadata": 3},
+ {"document": "World", "metadata": 9},
+ {"document": "World\n\n", "metadata": 9},
+ ]
+
+ assert s.drop_duplicates(sample) == (["Hello", "World", "Good"], [5, 8, 3])