From d95bddf2af14e8ac5cc147903435a005039c173e Mon Sep 17 00:00:00 2001 From: Oscar Najera Date: Sun, 4 Feb 2024 21:45:42 +0100 Subject: Rename semgrep to semantic-search --- scratch/semgrep/semantic-search.el | 148 +++++++++++++++++++++++++++++++++++++ scratch/semgrep/semgrep.el | 138 ---------------------------------- scratch/semgrep/server.py | 12 ++- 3 files changed, 158 insertions(+), 140 deletions(-) create mode 100644 scratch/semgrep/semantic-search.el delete mode 100644 scratch/semgrep/semgrep.el (limited to 'scratch') diff --git a/scratch/semgrep/semantic-search.el b/scratch/semgrep/semantic-search.el new file mode 100644 index 0000000..e1692d0 --- /dev/null +++ b/scratch/semgrep/semantic-search.el @@ -0,0 +1,148 @@ +;;; semantic-search.el --- Search for semantic similarity of text -*- lexical-binding: t; -*- +;; +;; Copyright (C) 2024 Óscar Nájera +;; +;; Author: Óscar Nájera +;; Maintainer: Óscar Nájera +;; Created: February 04, 2024 +;; Modified: February 04, 2024 +;; Version: 0.1.0 +;; Keywords: convenience data docs files hypermedia i18n matching tools +;; Homepage: https://github.com/titan/semantic-search +;; Package-Requires: ((emacs "27.1")) +;; +;; This file is not part of GNU Emacs. +;; +;;; Commentary: +;; +;; Search for semantic similarity of documents at a paragraph level +;; +;;; Code: + + +(require 'url) +(require 'org-element) +(require 'org-roam-db) +(require 'dash) + +;; Silence byte-compiler. +(defvar url-http-end-of-headers) + +(defcustom semantic-search-server-url "http://localhost:8080" + "Address where the Chromadb server is listening." + :type 'url + :group 'semantic-search) + +(defun semantic-search--connect (method data) + "Synchronous query to the server." + (let ((url-request-method "POST") + (url-request-extra-headers '(("Content-Type" . "application/json"))) + (url-request-data (encode-coding-string + (json-serialize `(,method ,data)) + 'utf-8))) + (with-current-buffer + (url-retrieve-synchronously semantic-search-server-url) + (goto-char url-http-end-of-headers) + (json-read)))) + +(defun semantic-search--org-id (paragraph &optional default) + (--> + (org-element-map + (org-element-property :parent paragraph) + 'node-property + (lambda (np) + (cons + (org-element-property :key np) + (org-element-property :value np)))) + (assoc 'ID it #'string=) + (cdr it) + (org-string-nw-p it) + (or it default))) + +(defun semantic-search--prepare-paragraph (file-id) + (lambda (paragraph) + (list + :document (substring-no-properties (org-element-interpret-data paragraph)) + :metadata (list :start-point + (org-element-property :begin paragraph) + :node-id + (semantic-search--org-id paragraph file-id))))) + +(defun semantic-search--add-buffer () + (interactive) + (if (eq major-mode 'org-mode) + (-some--> + (org-element-map + (org-element-parse-buffer) + 'paragraph + (semantic-search--prepare-paragraph (org-id-get (point-min) 'create))) + (cl-coerce it 'vector) + ;; (json-serialize it) + ;; (f-write it 'utf-8 "/tmp/out.json") + ;; (message "%S" it) + (semantic-search--connect :insert it)) + (user-error "This only works on org-mode"))) + +(defun semantic-search--roam-data (entries) + (thread-last + (cl-mapcar (lambda (meta) + (alist-get 'node-id meta)) + entries) + (delete-dups) + (vconcat) + (org-roam-db-query [:select [id title file] + :from nodes + :where (in id $v1)]))) + +(defun semantic-search--del-buffer (org-ids) + (interactive (list (org-id-get))) + (unless (null org-ids) + (semantic-search--connect :delete org-ids))) + +(defun semantic-search-pick-org-element () + (when-let ((context (ignore-errors (org-element-context)))) + (filter-buffer-substring (org-element-property :begin context) + (org-element-property :end context)))) + +(defun semantic-search--sync-db () + (org-roam-dolist-with-progress (file (nreverse (org-roam-list-files))) + "importing to semantic search" + (org-roam-with-file file nil + (semantic-search--add-buffer)))) + +;; (semantic-search--sync-db) +(defun semantic-search (text) + (interactive (list (or (semantic-search-pick-org-element) + (read-from-minibuffer "What are you looking for? ")))) + (-let (((&alist 'distances 'documents 'metadatas) + (semantic-search--connect :query text))) + (with-current-buffer (get-buffer-create "*Semantic Search*") + (erase-buffer) + (org-mode) + (insert "#+title: Looking for:\n" text "\n") + (cl-mapc + (lambda (entry-distances entry-document entry-metadatas) + (let ((data (semantic-search--roam-data entry-metadatas))) + (cl-mapc + (lambda (d paragraph meta) + (unless (zerop d) + (-let* ((node-id (alist-get 'node-id meta)) + ((_ title file) (assoc node-id data #'string=)) + (pos + (if file + (with-temp-buffer + (insert-file-contents file) + (line-number-at-pos (or (alist-get 'start-point meta) 1))) + 1))) + (insert + (format "* [[file:%s::%d][%s]]\n" file pos title) + "- Distance :: " (number-to-string d) "\n" + paragraph ?\n) + (org-fill-paragraph)))) + entry-distances entry-document entry-metadatas))) + distances documents metadatas) + (goto-char (point-min)) + (display-buffer (current-buffer))))) + +(provide 'semantic-search) +;;; semantic-search.el ends here diff --git a/scratch/semgrep/semgrep.el b/scratch/semgrep/semgrep.el deleted file mode 100644 index fdfcfdb..0000000 --- a/scratch/semgrep/semgrep.el +++ /dev/null @@ -1,138 +0,0 @@ -;;; semgrep.el --- Semantic search -*- lexical-binding: t; -*- -;; -;; Copyright (C) 2023 Óscar Nájera -;; -;; Author: Óscar Nájera -;; Maintainer: Óscar Nájera -;; Created: November 07, 2023 -;; Modified: November 07, 2023 -;; Version: 0.0.1 -;; Keywords: abbrev bib c calendar comm convenience data docs emulations extensions faces files frames games hardware help hypermedia i18n internal languages lisp local maint mail matching mouse multimedia news outlines processes terminals tex tools unix vc wp -;; Homepage: https://github.com/titan/semgrep -;; Package-Requires: ((emacs "27.1")) -;; -;; This file is not part of GNU Emacs. -;; -;;; Commentary: -;; -;; semantically search on my database by paragraph -;; -;;; Code: -(require 'url) -(require 'org-element) -(require 'org-roam-db) -(require 'dash) - -;; Silence byte-compiler. -(defvar url-http-end-of-headers) - -(defcustom semgrep-server-url "http://localhost:8080" - "Address where the Python server with the chromadb is listening." - :type 'url - :group 'semgrep) - -(defun semgrep--connect (method data) - (let ((url-request-method "POST") - (url-request-extra-headers '(("Content-Type" . "application/json"))) - (url-request-data (encode-coding-string - (json-serialize `(,method ,data)) - 'utf-8))) - (with-current-buffer - (url-retrieve-synchronously semgrep-server-url) - (goto-char url-http-end-of-headers) - (json-read)))) - -(defun semgrep--get-node-id (paragraph &optional default) - (--> - (org-element-map - (org-element-property :parent paragraph) - 'node-property - (lambda (np) - (cons - (org-element-property :key np) - (org-element-property :value np)))) - (assoc 'ID it #'string=) - (cdr it) - (org-string-nw-p it) - (or it default))) - -(defun semgrep--prepare-paragraph (file-id) - (lambda (paragraph) - (list - :document (substring-no-properties (org-element-interpret-data paragraph)) - :metadata (list :start-point - (org-element-property :begin paragraph) - :node-id - (semgrep--get-node-id paragraph file-id))))) - -(defun semgrep--add-buffer () - (interactive) - (if (eq major-mode 'org-mode) - (-some--> - (org-element-map - (org-element-parse-buffer) - 'paragraph - (semgrep--prepare-paragraph (org-id-get (point-min) 'create))) - (cl-coerce it 'vector) - ;; (json-serialize it) - ;; (f-write it 'utf-8 "/tmp/out.json") - ;; (message "%S" it) - (semgrep--connect :insert it)) - (user-error "This only works on org-mode"))) - -(defun semgrep--roam-data (entries) - (thread-last - (cl-mapcar (lambda (meta) - (alist-get 'node-id meta)) - entries) - (delete-dups) - (vconcat) - (org-roam-db-query [:select [id title file] - :from nodes - :where (in id $v1)]))) - -(defun semgrep-pick-org-element () - (when-let ((context (ignore-errors (org-element-context)))) - (filter-buffer-substring (org-element-property :begin context) - (org-element-property :end context)))) - -(defun semgrep--sync-db () - (org-roam-dolist-with-progress (file (nreverse (org-roam-list-files))) - "importing to semantic search" - (org-roam-with-file file nil - (semgrep--add-buffer)))) - -;; (semgrep--sync-db) -(defun semgrep-search (text) - (interactive (list (or (semgrep-pick-org-element) - (read-from-minibuffer "What are you looking for? ")))) - (-let (((&alist 'distances 'documents 'metadatas) - (semgrep--connect :query text))) - (with-current-buffer (get-buffer-create "*Semantic Search*") - (erase-buffer) - (org-mode) - (insert "#+title: Looking for:\n" text "\n") - (cl-mapc - (lambda (entry-distances entry-document entry-metadatas) - (let ((data (semgrep--roam-data entry-metadatas))) - (cl-mapc - (lambda (d paragraph meta) - (unless (zerop d) - (-let* ((node-id (alist-get 'node-id meta)) - ((_ title file) (assoc node-id data #'string=)) - (pos - (with-temp-buffer - (insert-file-contents file) - (line-number-at-pos (or (alist-get 'start-point meta) 1))))) - (insert - (format "* [[file:%s::%d][%s]]\n" file pos title) - "- Distance :: " (number-to-string d) "\n" - paragraph ?\n) - (org-fill-paragraph)))) - entry-distances entry-document entry-metadatas))) - distances documents metadatas) - (goto-char (point-min)) - (display-buffer (current-buffer))))) - -(provide 'semgrep) -;;; semgrep.el ends here diff --git a/scratch/semgrep/server.py b/scratch/semgrep/server.py index 6f3ebcd..becabbb 100644 --- a/scratch/semgrep/server.py +++ b/scratch/semgrep/server.py @@ -23,6 +23,11 @@ def ensure_list(data): raise ValueError("Data must be a list of strings") +def delete_nodes(nodes): + for node in nodes: + collection.delete(where={"node-id": node}) + + class MyRequestHandler(BaseHTTPRequestHandler): def do_POST(self): content_length = int(self.headers["Content-Length"]) @@ -31,6 +36,7 @@ class MyRequestHandler(BaseHTTPRequestHandler): try: data = json.loads(post_data) response_message = f"Received POST request with data: '{data}'\n" + self.log_message(response_message) except ValueError: response_message = "Invalid JSON data" self.send_response(400) @@ -38,12 +44,14 @@ class MyRequestHandler(BaseHTTPRequestHandler): if query := data.get("query"): self.log_message("Processing query '%s'", query.replace("\n", " ").strip()) response = collection.query(query_texts=ensure_list(query)) + elif delete_set := data.get("delete"): + delete_nodes(ensure_list(delete_set)) + response = f"Deleted nodes {delete_set}" elif paragraphs := data.get("insert"): data, metadata = drop_duplicates(paragraphs) nodes = set(m.get("node-id") for m in metadata) self.log_message("Processing metadata %s", nodes) - for node in nodes: - collection.delete(where={"node-id": node}) + delete_nodes(nodes) collection.add( documents=data, metadatas=metadata, ids=list(map(checksum, data)) ) -- cgit v1.2.3