scratch sem grep

author: Oscar Najera <hi@oscarnajera.com> 2023-07-31 21:51:44 +0200
committer: Oscar Najera <hi@oscarnajera.com> 2024-01-13 02:27:42 +0100
commit: 20fb1a21587dc72d5e314119d6986fd3f2a45f13 (patch)
tree: 085993f4209c373733412dc334296558c9794e9a
parent: 88fead2e2b6082e88d58a1af7c0afa1765a25253 (diff)
download: dotfiles-20fb1a21587dc72d5e314119d6986fd3f2a45f13.tar.gz
dotfiles-20fb1a21587dc72d5e314119d6986fd3f2a45f13.tar.bz2
dotfiles-20fb1a21587dc72d5e314119d6986fd3f2a45f13.zip
2 files changed, 129 insertions, 0 deletions
diff --git a/scratch/semgrep/parse.py b/scratch/semgrep/parse.py
new file mode 100644
index 0000000..3846e3e
--- /dev/null
+++ b/scratch/semgrep/parse.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+
+import orgparse
+import pandas as pd
+import re
+
+
+def org_roam_nodes_to_dataframe(org_file):
+    # Load the org file into an OrgData object
+    org_data = orgparse.load(org_file)
+
+    # Define a function to extract the title of a node
+    def extract_title(node):
+        if node.heading:
+            # If the node has a heading, return it
+            return node.heading
+        else:
+            # Otherwise, extract the title from the org file using a regular expression
+            title_pattern = re.compile(r"^#\+title:\s*(.*)$", re.IGNORECASE)
+            match = title_pattern.search(node.body)
+            if match:
+                return match.group(1)
+            else:
+                # If the title is not found, extract it from the first line of the body
+                return re.sub(
+                    r"#\+title:", "", node.body.split("\n")[0], flags=re.IGNORECASE
+                ).strip()
+
+    # Define a function to recursively extract the bodies of a node and its descendants
+    def extract_node_nested_body(node):
+        body = node.body
+        for child in node.children:
+            body += (
+                "\n"
+                + child.level * "*"
+                + " "
+                + child.heading
+                + "\n"
+                + extract_node_nested_body(child)
+            )
+        return body.strip()
+
+    # Define a function to recursively extract the bodies of a node
+    # and its descendants when they are not other nodes
+    def extract_node_nested_body_exclusive(node):
+        body = node.body
+        for child in node.children:
+            if not child.properties.get("ID") and not child.properties.get("SEARCH"):
+                body += (
+                    "\n"
+                    + child.level * "*"
+                    + " "
+                    + child.heading
+                    + "\n"
+                    + extract_node_nested_body_exclusive(child)
+                )
+        return body.strip()
+
+    # Define a function to build the hierarchy of a node
+    def build_node_hierarchy(node):
+        hierarchy = [extract_title(node)]
+        parent = node.parent
+
+        # while parent and parent != org_data[0]:
+        while parent:
+            hierarchy.append(extract_title(parent))
+            parent = parent.parent
+        return " > ".join(reversed(hierarchy)).strip()
+
+    # Define a function to convert a node to a dictionary
+    def node_to_dict(node, file_name):
+        node_dict = {
+            "file_name": file_name,
+            "node_id": node.properties.get("ID"),
+            "node_title": extract_title(node),
+            "node_hierarchy": build_node_hierarchy(node),
+            "node_text": node.body,
+            "node_text_nested": extract_node_nested_body(node),
+            "node_text_nested_exclusive": extract_node_nested_body_exclusive(node),
+        }
+
+        return node_dict
+
+    # Create a list of all org-roam nodes in the OrgData object
+    nodes = [
+        node_to_dict(node, org_file)
+        for node in org_data[0][:]
+        if node.properties.get("ID")
+    ]
+
+    return pd.DataFrame(nodes)
+
+
+model.encode(
+    "What you need is the gist. Your mind needs the repetition to absorb the information. The true learning is on *doing* not in the input. If you can't use the material, you don't know. There is just a limited amount you can learn by listening. The rest you must do yourself. Courses give you ideas, you must figure out what to do with them."
+)
+model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
+
+query_embedding = model.encode("How big is London")
+passage_embedding = model.encode(
+    [
+        "London has 9,787,426 inhabitants at the 2011 census",
+        "London is known for its finacial district",
+        "London is full of criminals",
+        "Cairo is small",
+    ]
+)
+
+print("Similarity:", util.dot_score(query_embedding, passage_embedding))
diff --git a/scratch/semgrep/test.org b/scratch/semgrep/test.org
new file mode 100644
index 0000000..aac16d0
--- /dev/null
+++ b/scratch/semgrep/test.org
@@ -0,0 +1,20 @@
+#+title: Test
+
+It has started all wars.
+
+No more data. It is a case of food supply. No more data. It is a case of food
+supply and nothing else for the.
+
+How an that
+
+#+begin_src emacs-lisp :results value raw
+(json-encode
+ (org-element-map
+     (org-element-parse-buffer)
+     'paragraph
+   (lambda (hl)
+     (car
+      (org-element-contents hl)))))
+#+end_src
+
+#+RESULTS:
author	Oscar Najera <hi@oscarnajera.com>	2023-07-31 21:51:44 +0200
committer	Oscar Najera <hi@oscarnajera.com>	2024-01-13 02:27:42 +0100
commit	20fb1a21587dc72d5e314119d6986fd3f2a45f13 (patch)
tree	085993f4209c373733412dc334296558c9794e9a
parent	88fead2e2b6082e88d58a1af7c0afa1765a25253 (diff)
download	dotfiles-20fb1a21587dc72d5e314119d6986fd3f2a45f13.tar.gz dotfiles-20fb1a21587dc72d5e314119d6986fd3f2a45f13.tar.bz2 dotfiles-20fb1a21587dc72d5e314119d6986fd3f2a45f13.zip