semgrep package

author: Oscar Najera <hi@oscarnajera.com> 2023-12-08 14:13:14 +0100
committer: Oscar Najera <hi@oscarnajera.com> 2024-01-13 02:27:42 +0100
commit: 97d4f907f670b4a16edf972a9339c194822eb476 (patch)
tree: eb18e4c692aabd9364df2193ec88cc6a1a6937cd
parent: 2d9291a7f24268ce58a3d98a4ad9a16a05665df0 (diff)
download: dotfiles-97d4f907f670b4a16edf972a9339c194822eb476.tar.gz
dotfiles-97d4f907f670b4a16edf972a9339c194822eb476.tar.bz2
dotfiles-97d4f907f670b4a16edf972a9339c194822eb476.zip
5 files changed, 24 insertions, 129 deletions
diff --git a/.gitignore b/.gitignore
index b534581..1862e32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /elisp/*.elc
+/scratch/semgrep/__pycache__/
diff --git a/scratch/semgrep/parse.py b/scratch/semgrep/parse.py
deleted file mode 100644
index 3846e3e..0000000
--- a/scratch/semgrep/parse.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-
-import orgparse
-import pandas as pd
-import re
-
-
-def org_roam_nodes_to_dataframe(org_file):
-    # Load the org file into an OrgData object
-    org_data = orgparse.load(org_file)
-
-    # Define a function to extract the title of a node
-    def extract_title(node):
-        if node.heading:
-            # If the node has a heading, return it
-            return node.heading
-        else:
-            # Otherwise, extract the title from the org file using a regular expression
-            title_pattern = re.compile(r"^#\+title:\s*(.*)$", re.IGNORECASE)
-            match = title_pattern.search(node.body)
-            if match:
-                return match.group(1)
-            else:
-                # If the title is not found, extract it from the first line of the body
-                return re.sub(
-                    r"#\+title:", "", node.body.split("\n")[0], flags=re.IGNORECASE
-                ).strip()
-
-    # Define a function to recursively extract the bodies of a node and its descendants
-    def extract_node_nested_body(node):
-        body = node.body
-        for child in node.children:
-            body += (
-                "\n"
-                + child.level * "*"
-                + " "
-                + child.heading
-                + "\n"
-                + extract_node_nested_body(child)
-            )
-        return body.strip()
-
-    # Define a function to recursively extract the bodies of a node
-    # and its descendants when they are not other nodes
-    def extract_node_nested_body_exclusive(node):
-        body = node.body
-        for child in node.children:
-            if not child.properties.get("ID") and not child.properties.get("SEARCH"):
-                body += (
-                    "\n"
-                    + child.level * "*"
-                    + " "
-                    + child.heading
-                    + "\n"
-                    + extract_node_nested_body_exclusive(child)
-                )
-        return body.strip()
-
-    # Define a function to build the hierarchy of a node
-    def build_node_hierarchy(node):
-        hierarchy = [extract_title(node)]
-        parent = node.parent
-
-        # while parent and parent != org_data[0]:
-        while parent:
-            hierarchy.append(extract_title(parent))
-            parent = parent.parent
-        return " > ".join(reversed(hierarchy)).strip()
-
-    # Define a function to convert a node to a dictionary
-    def node_to_dict(node, file_name):
-        node_dict = {
-            "file_name": file_name,
-            "node_id": node.properties.get("ID"),
-            "node_title": extract_title(node),
-            "node_hierarchy": build_node_hierarchy(node),
-            "node_text": node.body,
-            "node_text_nested": extract_node_nested_body(node),
-            "node_text_nested_exclusive": extract_node_nested_body_exclusive(node),
-        }
-
-        return node_dict
-
-    # Create a list of all org-roam nodes in the OrgData object
-    nodes = [
-        node_to_dict(node, org_file)
-        for node in org_data[0][:]
-        if node.properties.get("ID")
-    ]
-
-    return pd.DataFrame(nodes)
-
-
-model.encode(
-    "What you need is the gist. Your mind needs the repetition to absorb the information. The true learning is on *doing* not in the input. If you can't use the material, you don't know. There is just a limited amount you can learn by listening. The rest you must do yourself. Courses give you ideas, you must figure out what to do with them."
-)
-model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
-
-query_embedding = model.encode("How big is London")
-passage_embedding = model.encode(
-    [
-        "London has 9,787,426 inhabitants at the 2011 census",
-        "London is known for its finacial district",
-        "London is full of criminals",
-        "Cairo is small",
-    ]
-)
-
-print("Similarity:", util.dot_score(query_embedding, passage_embedding))
diff --git a/scratch/semgrep/requirements.txt b/scratch/semgrep/requirements.txt
new file mode 100644
index 0000000..3e493ea
--- /dev/null
+++ b/scratch/semgrep/requirements.txt
@@ -0,0 +1,2 @@
+sentence-transformers
+chromadb
diff --git a/scratch/semgrep/test.org b/scratch/semgrep/test.org
deleted file mode 100644
index aac16d0..0000000
--- a/scratch/semgrep/test.org
+++ /dev/null
@@ -1,20 +0,0 @@
-#+title: Test
-
-It has started all wars.
-
-No more data. It is a case of food suply. No more data. It is a case of food
-supply and nothing else for the.
-
-How an that
-
-#+begin_src emacs-lisp :results value raw
-(json-encode
- (org-element-map
-     (org-element-parse-buffer)
-     'paragraph
-   (lambda (hl)
-     (car
-      (org-element-contents hl)))))
-#+end_src
-
-#+RESULTS:
diff --git a/scratch/semgrep/utils.py b/scratch/semgrep/utils.py
new file mode 100644
index 0000000..66ee410
--- /dev/null
+++ b/scratch/semgrep/utils.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# These functions don't necessarily stand alone; I used them while debugging.
+import uuid
+
+response = collection.query(
+    query_texts="machines scale",
+    where={"node-id": "496d4874-be24-4601-8a87-214d55e11297"},
+)
+collection.query(query_texts="machines scale")
+
+
+def get_node(node_id):
+    return collection.get(where={"node-id": node_id})
+
+
+def get_data(amount):
+    nodeid = str(uuid.uuid4())
+    data = ["heelo" for _ in range(amount)]
+    metadata = [{"node-id": nodeid, "point": i} for i in range(amount)]
+    ids = [str(uuid.uuid4()) for _ in range(amount)]
+    return {"documents": data, "metadatas": metadata, "ids": ids}
author	Oscar Najera <hi@oscarnajera.com>	2023-12-08 14:13:14 +0100
committer	Oscar Najera <hi@oscarnajera.com>	2024-01-13 02:27:42 +0100
commit	97d4f907f670b4a16edf972a9339c194822eb476 (patch)
tree	eb18e4c692aabd9364df2193ec88cc6a1a6937cd
parent	2d9291a7f24268ce58a3d98a4ad9a16a05665df0 (diff)
download	dotfiles-97d4f907f670b4a16edf972a9339c194822eb476.tar.gz dotfiles-97d4f907f670b4a16edf972a9339c194822eb476.tar.bz2 dotfiles-97d4f907f670b4a16edf972a9339c194822eb476.zip