diff options
Diffstat (limited to 'scratch')
-rw-r--r-- | scratch/semgrep/requirements.txt | 2 | ||||
-rw-r--r-- | scratch/semgrep/semantic-search.el | 148 | ||||
-rw-r--r-- | scratch/semgrep/server.py | 116 | ||||
-rw-r--r-- | scratch/semgrep/tests.py | 15 | ||||
-rw-r--r-- | scratch/semgrep/utils.py | 21 |
5 files changed, 302 insertions, 0 deletions
diff --git a/scratch/semgrep/requirements.txt b/scratch/semgrep/requirements.txt new file mode 100644 index 0000000..3e493ea --- /dev/null +++ b/scratch/semgrep/requirements.txt @@ -0,0 +1,2 @@ +sentence-transformers +chromadb diff --git a/scratch/semgrep/semantic-search.el b/scratch/semgrep/semantic-search.el new file mode 100644 index 0000000..e1692d0 --- /dev/null +++ b/scratch/semgrep/semantic-search.el @@ -0,0 +1,148 @@ +;;; semantic-search.el --- Search for semantic similarity of text -*- lexical-binding: t; -*- +;; +;; Copyright (C) 2024 Óscar Nájera +;; +;; Author: Óscar Nájera <hi@oscarnajera.com> +;; Maintainer: Óscar Nájera <hi@oscarnajera.com> +;; Created: February 04, 2024 +;; Modified: February 04, 2024 +;; Version: 0.1.0 +;; Keywords: convenience data docs files hypermedia i18n matching tools +;; Homepage: https://github.com/titan/semantic-search +;; Package-Requires: ((emacs "27.1")) +;; +;; This file is not part of GNU Emacs. +;; +;;; Commentary: +;; +;; Search for semantic similarity of documents at a paragraph level +;; +;;; Code: + + +(require 'url) +(require 'org-element) +(require 'org-roam-db) +(require 'dash) + +;; Silence byte-compiler. +(defvar url-http-end-of-headers) + +(defcustom semantic-search-server-url "http://localhost:8080" + "Address where the Chromadb server is listening." + :type 'url + :group 'semantic-search) + +(defun semantic-search--connect (method data) + "Synchronous query to the server." + (let ((url-request-method "POST") + (url-request-extra-headers '(("Content-Type" . 
"application/json"))) + (url-request-data (encode-coding-string + (json-serialize `(,method ,data)) + 'utf-8))) + (with-current-buffer + (url-retrieve-synchronously semantic-search-server-url) + (goto-char url-http-end-of-headers) + (json-read)))) + +(defun semantic-search--org-id (paragraph &optional default) + (--> + (org-element-map + (org-element-property :parent paragraph) + 'node-property + (lambda (np) + (cons + (org-element-property :key np) + (org-element-property :value np)))) + (assoc 'ID it #'string=) + (cdr it) + (org-string-nw-p it) + (or it default))) + +(defun semantic-search--prepare-paragraph (file-id) + (lambda (paragraph) + (list + :document (substring-no-properties (org-element-interpret-data paragraph)) + :metadata (list :start-point + (org-element-property :begin paragraph) + :node-id + (semantic-search--org-id paragraph file-id))))) + +(defun semantic-search--add-buffer () + (interactive) + (if (eq major-mode 'org-mode) + (-some--> + (org-element-map + (org-element-parse-buffer) + 'paragraph + (semantic-search--prepare-paragraph (org-id-get (point-min) 'create))) + (cl-coerce it 'vector) + ;; (json-serialize it) + ;; (f-write it 'utf-8 "/tmp/out.json") + ;; (message "%S" it) + (semantic-search--connect :insert it)) + (user-error "This only works on org-mode"))) + +(defun semantic-search--roam-data (entries) + (thread-last + (cl-mapcar (lambda (meta) + (alist-get 'node-id meta)) + entries) + (delete-dups) + (vconcat) + (org-roam-db-query [:select [id title file] + :from nodes + :where (in id $v1)]))) + +(defun semantic-search--del-buffer (org-ids) + (interactive (list (org-id-get))) + (unless (null org-ids) + (semantic-search--connect :delete org-ids))) + +(defun semantic-search-pick-org-element () + (when-let ((context (ignore-errors (org-element-context)))) + (filter-buffer-substring (org-element-property :begin context) + (org-element-property :end context)))) + +(defun semantic-search--sync-db () + (org-roam-dolist-with-progress (file 
(nreverse (org-roam-list-files))) + "importing to semantic search" + (org-roam-with-file file nil + (semantic-search--add-buffer)))) + +;; (semantic-search--sync-db) +(defun semantic-search (text) + (interactive (list (or (semantic-search-pick-org-element) + (read-from-minibuffer "What are you looking for? ")))) + (-let (((&alist 'distances 'documents 'metadatas) + (semantic-search--connect :query text))) + (with-current-buffer (get-buffer-create "*Semantic Search*") + (erase-buffer) + (org-mode) + (insert "#+title: Looking for:\n" text "\n") + (cl-mapc + (lambda (entry-distances entry-document entry-metadatas) + (let ((data (semantic-search--roam-data entry-metadatas))) + (cl-mapc + (lambda (d paragraph meta) + (unless (zerop d) + (-let* ((node-id (alist-get 'node-id meta)) + ((_ title file) (assoc node-id data #'string=)) + (pos + (if file + (with-temp-buffer + (insert-file-contents file) + (line-number-at-pos (or (alist-get 'start-point meta) 1))) + 1))) + (insert + (format "* [[file:%s::%d][%s]]\n" file pos title) + "- Distance :: " (number-to-string d) "\n" + paragraph ?\n) + (org-fill-paragraph)))) + entry-distances entry-document entry-metadatas))) + distances documents metadatas) + (goto-char (point-min)) + (display-buffer (current-buffer))))) + +(provide 'semantic-search) +;;; semantic-search.el ends here diff --git a/scratch/semgrep/server.py b/scratch/semgrep/server.py new file mode 100644 index 0000000..becabbb --- /dev/null +++ b/scratch/semgrep/server.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +import argparse +import collections +import hashlib +import json +from http.server import BaseHTTPRequestHandler, HTTPServer + +import chromadb + + +def checksum(string: str): + sha256 = hashlib.sha256() + sha256.update(string.encode("utf-8")) + return sha256.hexdigest()[:32] + + +def ensure_list(data): + if isinstance(data, str): + return [data] + if isinstance(data, list): + if all(isinstance(l, str) for l in data): + return data + raise ValueError("Data 
must be a list of strings") + + +def delete_nodes(nodes): + for node in nodes: + collection.delete(where={"node-id": node}) + + +class MyRequestHandler(BaseHTTPRequestHandler): + def do_POST(self): + content_length = int(self.headers["Content-Length"]) + post_data = self.rfile.read(content_length).decode("utf-8") + + try: + data = json.loads(post_data) + response_message = f"Received POST request with data: '{data}'\n" + self.log_message(response_message) + except ValueError: + response_message = "Invalid JSON data" + self.send_response(400) + + if query := data.get("query"): + self.log_message("Processing query '%s'", query.replace("\n", " ").strip()) + response = collection.query(query_texts=ensure_list(query)) + elif delete_set := data.get("delete"): + delete_nodes(ensure_list(delete_set)) + response = f"Deleted nodes {delete_set}" + elif paragraphs := data.get("insert"): + data, metadata = drop_duplicates(paragraphs) + nodes = set(m.get("node-id") for m in metadata) + self.log_message("Processing metadata %s", nodes) + delete_nodes(nodes) + collection.add( + documents=data, metadatas=metadata, ids=list(map(checksum, data)) + ) + response = f"Successfully inserted {nodes}" + else: + raise ValueError(f"Used wrong method. 
Sent: {data.keys()}") + + response_message = json.dumps(response) + + self.send_response(200) + self.send_header("Content-type", "text/plain") + self.end_headers() + self.wfile.write(response_message.encode("utf-8")) + + +def run_server(port=8080): + server_address = ("", port) + httpd = HTTPServer(server_address, MyRequestHandler) + print(f"Server running on port {port}") + httpd.serve_forever() + + +def drop_duplicates(paragraphs): + data = [data["document"].replace("\n", " ").strip() for data in paragraphs] + metadata = [data["metadata"] for data in paragraphs] + dups = (x for x, count in collections.Counter(data).items() if count > 1) + to_drop = [] + for no in dups: + to_drop.extend([i for i, d in enumerate(data) if d == no][1:]) + to_drop.sort(reverse=True) + for index in to_drop: + data.pop(index) + metadata.pop(index) + return data, metadata + + +def parse_arguments(args=None): + parser = argparse.ArgumentParser( + description="Run Semantic database server", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-db", "--database", default="org-roam", help="Name of the collection" + ) + parser.add_argument( + "-D", + "--database-dir", + default="semantic-roam", + help="Directory where to store database files", + ) + parser.add_argument( + "-p", "--port", default=8080, type=int, help="Port where server listens" + ) + + return parser.parse_args(args) + + +if __name__ == "__main__": + args = parse_arguments() + client = chromadb.PersistentClient(path=args.database_dir) + collection = client.get_or_create_collection(args.database) + run_server(args.port) diff --git a/scratch/semgrep/tests.py b/scratch/semgrep/tests.py new file mode 100644 index 0000000..0e5d233 --- /dev/null +++ b/scratch/semgrep/tests.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +import server as s + + +def test_drop_dup(): + sample = [ + {"document": "Hello", "metadata": 5}, + {"document": "World", "metadata": 8}, + {"document": "Hello", "metadata": 6}, + 
{"document": "Good", "metadata": 3}, + {"document": "World", "metadata": 9}, + {"document": "World\n\n", "metadata": 9}, + ] + + assert s.drop_duplicates(sample) == (["Hello", "World", "Good"], [5, 8, 3]) diff --git a/scratch/semgrep/utils.py b/scratch/semgrep/utils.py new file mode 100644 index 0000000..66ee410 --- /dev/null +++ b/scratch/semgrep/utils.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# These functions don't necessarily stand alone; I used them during debugging. +import uuid + +response = collection.query( + query_texts="machines scale", + where={"node-id": "496d4874-be24-4601-8a87-214d55e11297"}, +) +collection.query(query_texts="machines scale") + + +def get_node(node_id): + return collection.get(where={"node-id": node_id}) + + +def get_data(amount): + nodeid = str(uuid.uuid4()) + data = ["heelo" for _ in range(amount)] + metadata = [{"node-id": nodeid, "point": i} for i in range(amount)] + ids = [str(uuid.uuid4()) for _ in range(amount)] + return {"documents": data, "metadatas": metadata, "ids": ids} |