Diffstat (limited to 'scratch')
-rw-r--r--   scratch/semgrep/requirements.txt     |   2
-rw-r--r--   scratch/semgrep/semantic-search.el   | 148
-rw-r--r--   scratch/semgrep/server.py            | 116
-rw-r--r--   scratch/semgrep/tests.py             |  15
-rw-r--r--   scratch/semgrep/utils.py             |  21
5 files changed, 302 insertions, 0 deletions
diff --git a/scratch/semgrep/requirements.txt b/scratch/semgrep/requirements.txt
new file mode 100644
index 0000000..3e493ea
--- /dev/null
+++ b/scratch/semgrep/requirements.txt
@@ -0,0 +1,2 @@
+sentence-transformers
+chromadb
diff --git a/scratch/semgrep/semantic-search.el b/scratch/semgrep/semantic-search.el
new file mode 100644
index 0000000..e1692d0
--- /dev/null
+++ b/scratch/semgrep/semantic-search.el
@@ -0,0 +1,148 @@
+;;; semantic-search.el --- Search for semantic similarity of text -*- lexical-binding: t; -*-
+;;
+;; Copyright (C) 2024 Óscar Nájera
+;;
+;; Author: Óscar Nájera <hi@oscarnajera.com>
+;; Maintainer: Óscar Nájera <hi@oscarnajera.com>
+;; Created: February 04, 2024
+;; Modified: February 04, 2024
+;; Version: 0.1.0
+;; Keywords: convenience data docs files hypermedia i18n matching tools
+;; Homepage: https://github.com/titan/semantic-search
+;; Package-Requires: ((emacs "27.1"))
+;;
+;; This file is not part of GNU Emacs.
+;;
+;;; Commentary:
+;;
+;; Search for semantic similarity of documents at a paragraph level
+;;
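+;; The Emacs side talks JSON over HTTP to the companion server in
+;; server.py, which keeps paragraph embeddings in a Chromadb collection
+;; (see requirements.txt: sentence-transformers, chromadb).  Each request
+;; body carries one top-level key naming the operation: :insert with a
+;; vector of paragraph plists, :delete with org IDs, or :query with text.
+;;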
+;;; Code:
+
+
+(require 'url)
+(require 'org-element)
+(require 'org-roam-db)
+(require 'dash)
+
+;; Silence byte-compiler.
+(defvar url-http-end-of-headers)
+
+(defcustom semantic-search-server-url "http://localhost:8080"
+  "Address where the Chromadb server is listening."
+  :type 'string
+  :group 'semantic-search)
+
+(defun semantic-search--connect (method data)
+  "Synchronously POST DATA under the METHOD key and return the parsed reply.
+METHOD is one of :query, :insert or :delete."
+ (let ((url-request-method "POST")
+ (url-request-extra-headers '(("Content-Type" . "application/json")))
+ (url-request-data (encode-coding-string
+ (json-serialize `(,method ,data))
+ 'utf-8)))
+ (with-current-buffer
+ (url-retrieve-synchronously semantic-search-server-url)
+ (goto-char url-http-end-of-headers)
+ (json-read))))
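+;; For example, a lookup against the running server might be:
+;;   (semantic-search--connect :query "machines scale")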
+
+(defun semantic-search--org-id (paragraph &optional default)
+  "Return the ID property of the Org node containing PARAGRAPH.
+When the node carries no ID, return DEFAULT."
+ (-->
+ (org-element-map
+ (org-element-property :parent paragraph)
+ 'node-property
+ (lambda (np)
+ (cons
+ (org-element-property :key np)
+ (org-element-property :value np))))
+ (assoc 'ID it #'string=)
+ (cdr it)
+ (org-string-nw-p it)
+ (or it default)))
+
+(defun semantic-search--prepare-paragraph (file-id)
+  "Return a function serializing a paragraph into a plist for the server.
+FILE-ID identifies the file-level node, used when a paragraph has no own ID."
+ (lambda (paragraph)
+ (list
+ :document (substring-no-properties (org-element-interpret-data paragraph))
+ :metadata (list :start-point
+ (org-element-property :begin paragraph)
+ :node-id
+ (semantic-search--org-id paragraph file-id)))))
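+;; A prepared paragraph is a plist roughly of this shape (values
+;; illustrative):
+;;   (:document "Some paragraph text..."
+;;    :metadata (:start-point 120 :node-id "<org id>"))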
+
+(defun semantic-search--add-buffer ()
+  "Send every paragraph of the current Org buffer to the server."
+  (interactive)
+  (if (derived-mode-p 'org-mode)
+      (-some-->
+          (org-element-map
+              (org-element-parse-buffer)
+              'paragraph
+            (semantic-search--prepare-paragraph (org-id-get (point-min) 'create)))
+        (cl-coerce it 'vector)
+        (semantic-search--connect :insert it))
+    (user-error "This only works in Org mode")))
+
+(defun semantic-search--roam-data (entries)
+  "Query org-roam for id, title and file of the nodes mentioned in ENTRIES."
+  (thread-last
+ (cl-mapcar (lambda (meta)
+ (alist-get 'node-id meta))
+ entries)
+ (delete-dups)
+ (vconcat)
+ (org-roam-db-query [:select [id title file]
+ :from nodes
+ :where (in id $v1)])))
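+;; Each result row has the shape (ID TITLE FILE), which
+;; `semantic-search' destructures below.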
+
+(defun semantic-search--del-buffer (org-ids)
+  "Delete the paragraphs stored under ORG-IDS from the server.
+ORG-IDS may be a single ID string or a list of IDs."
+  (interactive (list (org-id-get)))
+  (unless (null org-ids)
+    (semantic-search--connect :delete org-ids)))
+
+(defun semantic-search-pick-org-element ()
+  "Return the Org element at point as a string, or nil."
+ (when-let ((context (ignore-errors (org-element-context))))
+ (filter-buffer-substring (org-element-property :begin context)
+ (org-element-property :end context))))
+
+(defun semantic-search--sync-db ()
+  "Index every org-roam file into the semantic-search database."
+ (org-roam-dolist-with-progress (file (nreverse (org-roam-list-files)))
+ "importing to semantic search"
+ (org-roam-with-file file nil
+ (semantic-search--add-buffer))))
+
+;; (semantic-search--sync-db)
+(defun semantic-search (text)
+  "Show paragraphs semantically similar to TEXT in a results buffer."
+  (interactive (list (or (semantic-search-pick-org-element)
+                         (read-from-minibuffer "What are you looking for? "))))
+  (-let (((&alist 'distances 'documents 'metadatas)
+          (semantic-search--connect :query text)))
+    (with-current-buffer (get-buffer-create "*Semantic Search*")
+      (erase-buffer)
+      (org-mode)
+      (insert "#+title: Looking for: " text "\n")
+ (cl-mapc
+ (lambda (entry-distances entry-document entry-metadatas)
+ (let ((data (semantic-search--roam-data entry-metadatas)))
+ (cl-mapc
+ (lambda (d paragraph meta)
+ (unless (zerop d)
+ (-let* ((node-id (alist-get 'node-id meta))
+ ((_ title file) (assoc node-id data #'string=))
+ (pos
+ (if file
+ (with-temp-buffer
+ (insert-file-contents file)
+ (line-number-at-pos (or (alist-get 'start-point meta) 1)))
+ 1)))
+ (insert
+ (format "* [[file:%s::%d][%s]]\n" file pos title)
+ "- Distance :: " (number-to-string d) "\n"
+ paragraph ?\n)
+ (org-fill-paragraph))))
+ entry-distances entry-document entry-metadatas)))
+ distances documents metadatas)
+ (goto-char (point-min))
+ (display-buffer (current-buffer)))))
+
+(provide 'semantic-search)
+;;; semantic-search.el ends here
diff --git a/scratch/semgrep/server.py b/scratch/semgrep/server.py
new file mode 100644
index 0000000..becabbb
--- /dev/null
+++ b/scratch/semgrep/server.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+import argparse
+import collections
+import hashlib
+import json
+from http.server import BaseHTTPRequestHandler, HTTPServer
+
+import chromadb
+
+
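+# Truncated SHA-256 digests of the text serve as content-derived document
+# IDs, so the same paragraph always maps to the same ID.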
+def checksum(string: str):
+ sha256 = hashlib.sha256()
+ sha256.update(string.encode("utf-8"))
+ return sha256.hexdigest()[:32]
+
+
+def ensure_list(data):
+    """Wrap a bare string in a list; validate lists of strings."""
+    if isinstance(data, str):
+        return [data]
+    if isinstance(data, list) and all(isinstance(item, str) for item in data):
+        return data
+    raise ValueError("Data must be a string or a list of strings")
+
+
+def delete_nodes(nodes):
+    """Drop every stored paragraph belonging to the given node IDs."""
+    for node in nodes:
+        collection.delete(where={"node-id": node})
+
+
+class MyRequestHandler(BaseHTTPRequestHandler):
+    def do_POST(self):
+        content_length = int(self.headers["Content-Length"])
+        post_data = self.rfile.read(content_length).decode("utf-8")
+
+        try:
+            data = json.loads(post_data)
+            self.log_message("Received POST request with data: '%s'", data)
+        except ValueError:
+            # Bail out early instead of falling through with `data` unbound.
+            self.send_response(400)
+            self.send_header("Content-type", "text/plain")
+            self.end_headers()
+            self.wfile.write(b"Invalid JSON data")
+            return
+
+        if query := data.get("query"):
+            self.log_message("Processing query '%s'", query.replace("\n", " ").strip())
+            response = collection.query(query_texts=ensure_list(query))
+        elif delete_set := data.get("delete"):
+            delete_nodes(ensure_list(delete_set))
+            response = f"Deleted nodes {delete_set}"
+        elif paragraphs := data.get("insert"):
+            documents, metadata = drop_duplicates(paragraphs)
+            nodes = set(m.get("node-id") for m in metadata)
+            self.log_message("Processing metadata %s", nodes)
+            # Re-inserting a node replaces all of its stored paragraphs.
+            delete_nodes(nodes)
+            collection.add(
+                documents=documents,
+                metadatas=metadata,
+                ids=list(map(checksum, documents)),
+            )
+            response = f"Successfully inserted {nodes}"
+        else:
+            self.send_response(400)
+            self.send_header("Content-type", "text/plain")
+            self.end_headers()
+            self.wfile.write(f"Unknown method. Sent: {list(data.keys())}".encode("utf-8"))
+            return
+
+        response_message = json.dumps(response)
+
+        self.send_response(200)
+        self.send_header("Content-type", "application/json")
+        self.end_headers()
+        self.wfile.write(response_message.encode("utf-8"))
+
+
+def run_server(port=8080):
+ server_address = ("", port)
+ httpd = HTTPServer(server_address, MyRequestHandler)
+ print(f"Server running on port {port}")
+ httpd.serve_forever()
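+# A request against a locally running server could look like this
+# (hypothetical data):
+#   curl -X POST http://localhost:8080 -H "Content-Type: application/json" \
+#        -d '{"query": "machines scale"}'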
+
+
+def drop_duplicates(paragraphs):
+    """Normalize paragraph texts and keep only the first copy of each."""
+    texts = [p["document"].replace("\n", " ").strip() for p in paragraphs]
+    metadata = [p["metadata"] for p in paragraphs]
+    dups = (text for text, count in collections.Counter(texts).items() if count > 1)
+    to_drop = []
+    for dup in dups:
+        # Keep the first occurrence, mark the rest for removal.
+        to_drop.extend([i for i, text in enumerate(texts) if text == dup][1:])
+    to_drop.sort(reverse=True)
+    for index in to_drop:
+        texts.pop(index)
+        metadata.pop(index)
+    return texts, metadata
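+# Worked example (mirrors tests.py): "World\n\n" normalizes to "World",
+# and only the first copy of each text survives:
+#   drop_duplicates([{"document": "Hello", "metadata": 5},
+#                    {"document": "World", "metadata": 8},
+#                    {"document": "World\n\n", "metadata": 9}])
+#   -> (["Hello", "World"], [5, 8])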
+
+
+def parse_arguments(args=None):
+ parser = argparse.ArgumentParser(
+ description="Run Semantic database server",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ )
+ parser.add_argument(
+ "-db", "--database", default="org-roam", help="Name of the collection"
+ )
+ parser.add_argument(
+ "-D",
+ "--database-dir",
+ default="semantic-roam",
+ help="Directory where to store database files",
+ )
+ parser.add_argument(
+ "-p", "--port", default=8080, type=int, help="Port where server listens"
+ )
+
+ return parser.parse_args(args)
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ client = chromadb.PersistentClient(path=args.database_dir)
+ collection = client.get_or_create_collection(args.database)
+ run_server(args.port)
diff --git a/scratch/semgrep/tests.py b/scratch/semgrep/tests.py
new file mode 100644
index 0000000..0e5d233
--- /dev/null
+++ b/scratch/semgrep/tests.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import server as s
+
+
+def test_drop_dup():
+ sample = [
+ {"document": "Hello", "metadata": 5},
+ {"document": "World", "metadata": 8},
+ {"document": "Hello", "metadata": 6},
+ {"document": "Good", "metadata": 3},
+ {"document": "World", "metadata": 9},
+ {"document": "World\n\n", "metadata": 9},
+ ]
+
+ assert s.drop_duplicates(sample) == (["Hello", "World", "Good"], [5, 8, 3])
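+# Run with pytest (not pinned in requirements.txt): python -m pytest tests.py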
diff --git a/scratch/semgrep/utils.py b/scratch/semgrep/utils.py
new file mode 100644
index 0000000..66ee410
--- /dev/null
+++ b/scratch/semgrep/utils.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# These functions don't necessarily stand alone; they come from a debugging
+# session against the running database.
+import uuid
+
+import chromadb
+
+# Assumption: the same defaults server.py uses for its persistent collection.
+client = chromadb.PersistentClient(path="semantic-roam")
+collection = client.get_or_create_collection("org-roam")
+
+response = collection.query(
+    query_texts="machines scale",
+    where={"node-id": "496d4874-be24-4601-8a87-214d55e11297"},
+)
+collection.query(query_texts="machines scale")
+
+
+def get_node(node_id):
+    """Fetch every stored paragraph belonging to one org node."""
+    return collection.get(where={"node-id": node_id})
+
+
+def get_data(amount):
+    """Build AMOUNT dummy documents under a single synthetic node."""
+    nodeid = str(uuid.uuid4())
+    data = ["hello" for _ in range(amount)]
+    metadata = [{"node-id": nodeid, "point": i} for i in range(amount)]
+    ids = [str(uuid.uuid4()) for _ in range(amount)]
+    return {"documents": data, "metadatas": metadata, "ids": ids}
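+# For instance, collection.add(**get_data(3)) seeds the store with three
+# dummy paragraphs under one synthetic node.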