about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Oscar Najera <hi@oscarnajera.com> 2023-12-08 14:13:14 +0100
committer Oscar Najera <hi@oscarnajera.com> 2024-01-13 02:27:42 +0100
commit 97d4f907f670b4a16edf972a9339c194822eb476 (patch)
tree eb18e4c692aabd9364df2193ec88cc6a1a6937cd
parent 2d9291a7f24268ce58a3d98a4ad9a16a05665df0 (diff)
download dotfiles-97d4f907f670b4a16edf972a9339c194822eb476.tar.gz
dotfiles-97d4f907f670b4a16edf972a9339c194822eb476.tar.bz2
dotfiles-97d4f907f670b4a16edf972a9339c194822eb476.zip
semgrep package
-rw-r--r-- .gitignore 1
-rw-r--r-- scratch/semgrep/parse.py 109
-rw-r--r-- scratch/semgrep/requirements.txt 2
-rw-r--r-- scratch/semgrep/test.org 20
-rw-r--r-- scratch/semgrep/utils.py 21
5 files changed, 24 insertions, 129 deletions
diff --git a/.gitignore b/.gitignore
index b534581..1862e32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
/elisp/*.elc
+/scratch/semgrep/__pycache__/
diff --git a/scratch/semgrep/parse.py b/scratch/semgrep/parse.py
deleted file mode 100644
index 3846e3e..0000000
--- a/scratch/semgrep/parse.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-
-import orgparse
-import pandas as pd
-import re
-
-
-def org_roam_nodes_to_dataframe(org_file):
- # Load the org file into an OrgData object
- org_data = orgparse.load(org_file)
-
- # Define a function to extract the title of a node
- def extract_title(node):
- if node.heading:
- # If the node has a heading, return it
- return node.heading
- else:
- # Otherwise, extract the title from the org file using a regular expression
- title_pattern = re.compile(r"^#\+title:\s*(.*)$", re.IGNORECASE)
- match = title_pattern.search(node.body)
- if match:
- return match.group(1)
- else:
- # If the title is not found, extract it from the first line of the body
- return re.sub(
- r"#\+title:", "", node.body.split("\n")[0], flags=re.IGNORECASE
- ).strip()
-
- # Define a function to recursively extract the bodies of a node and its descendants
- def extract_node_nested_body(node):
- body = node.body
- for child in node.children:
- body += (
- "\n"
- + child.level * "*"
- + " "
- + child.heading
- + "\n"
- + extract_node_nested_body(child)
- )
- return body.strip()
-
- # Define a function to recursively extract the bodies of a node
- # and its descendants when they are not other nodes
- def extract_node_nested_body_exclusive(node):
- body = node.body
- for child in node.children:
- if not child.properties.get("ID") and not child.properties.get("SEARCH"):
- body += (
- "\n"
- + child.level * "*"
- + " "
- + child.heading
- + "\n"
- + extract_node_nested_body_exclusive(child)
- )
- return body.strip()
-
- # Define a function to build the hierarchy of a node
- def build_node_hierarchy(node):
- hierarchy = [extract_title(node)]
- parent = node.parent
-
- # while parent and parent != org_data[0]:
- while parent:
- hierarchy.append(extract_title(parent))
- parent = parent.parent
- return " > ".join(reversed(hierarchy)).strip()
-
- # Define a function to convert a node to a dictionary
- def node_to_dict(node, file_name):
- node_dict = {
- "file_name": file_name,
- "node_id": node.properties.get("ID"),
- "node_title": extract_title(node),
- "node_hierarchy": build_node_hierarchy(node),
- "node_text": node.body,
- "node_text_nested": extract_node_nested_body(node),
- "node_text_nested_exclusive": extract_node_nested_body_exclusive(node),
- }
-
- return node_dict
-
- # Create a list of all org-roam nodes in the OrgData object
- nodes = [
- node_to_dict(node, org_file)
- for node in org_data[0][:]
- if node.properties.get("ID")
- ]
-
- return pd.DataFrame(nodes)
-
-
-model.encode(
- "What you need is the gist. Your mind needs the repetition to absorb the information. The true learning is on *doing* not in the input. If you can't use the material, you don't know. There is just a limited amount you can learn by listening. The rest you must do yourself. Courses give you ideas, you must figure out what to do with them."
-)
-model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
-
-query_embedding = model.encode("How big is London")
-passage_embedding = model.encode(
- [
- "London has 9,787,426 inhabitants at the 2011 census",
- "London is known for its finacial district",
- "London is full of criminals",
- "Cairo is small",
- ]
-)
-
-print("Similarity:", util.dot_score(query_embedding, passage_embedding))
diff --git a/scratch/semgrep/requirements.txt b/scratch/semgrep/requirements.txt
new file mode 100644
index 0000000..3e493ea
--- /dev/null
+++ b/scratch/semgrep/requirements.txt
@@ -0,0 +1,2 @@
+sentence-transformers
+chromadb
diff --git a/scratch/semgrep/test.org b/scratch/semgrep/test.org
deleted file mode 100644
index aac16d0..0000000
--- a/scratch/semgrep/test.org
+++ /dev/null
@@ -1,20 +0,0 @@
-#+title: Test
-
-It has started all wars.
-
-No more data. It is a case of food suply. No more data. It is a case of food
-supply and nothing else for the.
-
-How an that
-
-#+begin_src emacs-lisp :results value raw
-(json-encode
- (org-element-map
- (org-element-parse-buffer)
- 'paragraph
- (lambda (hl)
- (car
- (org-element-contents hl)))))
-#+end_src
-
-#+RESULTS:
diff --git a/scratch/semgrep/utils.py b/scratch/semgrep/utils.py
new file mode 100644
index 0000000..66ee410
--- /dev/null
+++ b/scratch/semgrep/utils.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# These functions don't necessarily stand alone; I used them while debugging.
+import uuid
+
+response = collection.query(
+ query_texts="machines scale",
+ where={"node-id": "496d4874-be24-4601-8a87-214d55e11297"},
+)
+collection.query(query_texts="machines scale")
+
+
+def get_node(node_id):
+ return collection.get(where={"node-id": node_id})
+
+
+def get_data(amount):
+ nodeid = str(uuid.uuid4())
+ data = ["heelo" for _ in range(amount)]
+ metadata = [{"node-id": nodeid, "point": i} for i in range(amount)]
+ ids = [str(uuid.uuid4()) for _ in range(amount)]
+ return {"documents": data, "metadatas": metadata, "ids": ids}