From 97d4f907f670b4a16edf972a9339c194822eb476 Mon Sep 17 00:00:00 2001 From: Oscar Najera Date: Fri, 8 Dec 2023 14:13:14 +0100 Subject: semgrep package --- .gitignore | 1 + scratch/semgrep/parse.py | 109 --------------------------------------- scratch/semgrep/requirements.txt | 2 + scratch/semgrep/test.org | 20 ------- scratch/semgrep/utils.py | 21 ++++++++ 5 files changed, 24 insertions(+), 129 deletions(-) delete mode 100644 scratch/semgrep/parse.py create mode 100644 scratch/semgrep/requirements.txt delete mode 100644 scratch/semgrep/test.org create mode 100644 scratch/semgrep/utils.py diff --git a/.gitignore b/.gitignore index b534581..1862e32 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /elisp/*.elc +/scratch/semgrep/__pycache__/ diff --git a/scratch/semgrep/parse.py b/scratch/semgrep/parse.py deleted file mode 100644 index 3846e3e..0000000 --- a/scratch/semgrep/parse.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python3 - -import orgparse -import pandas as pd -import re - - -def org_roam_nodes_to_dataframe(org_file): - # Load the org file into an OrgData object - org_data = orgparse.load(org_file) - - # Define a function to extract the title of a node - def extract_title(node): - if node.heading: - # If the node has a heading, return it - return node.heading - else: - # Otherwise, extract the title from the org file using a regular expression - title_pattern = re.compile(r"^#\+title:\s*(.*)$", re.IGNORECASE) - match = title_pattern.search(node.body) - if match: - return match.group(1) - else: - # If the title is not found, extract it from the first line of the body - return re.sub( - r"#\+title:", "", node.body.split("\n")[0], flags=re.IGNORECASE - ).strip() - - # Define a function to recursively extract the bodies of a node and its descendants - def extract_node_nested_body(node): - body = node.body - for child in node.children: - body += ( - "\n" - + child.level * "*" - + " " - + child.heading - + "\n" - + extract_node_nested_body(child) - ) - return body.strip() - - # Define a function to recursively extract the bodies of a node - # and its descendants when they are not other nodes - def extract_node_nested_body_exclusive(node): - body = node.body - for child in node.children: - if not child.properties.get("ID") and not child.properties.get("SEARCH"): - body += ( - "\n" - + child.level * "*" - + " " - + child.heading - + "\n" - + extract_node_nested_body_exclusive(child) - ) - return body.strip() - - # Define a function to build the hierarchy of a node - def build_node_hierarchy(node): - hierarchy = [extract_title(node)] - parent = node.parent - - # while parent and parent != org_data[0]: - while parent: - hierarchy.append(extract_title(parent)) - parent = parent.parent - return " > ".join(reversed(hierarchy)).strip() - - # Define a function to convert a node to a dictionary - def node_to_dict(node, file_name): - node_dict = { - "file_name": file_name, - "node_id": node.properties.get("ID"), - "node_title": extract_title(node), - "node_hierarchy": build_node_hierarchy(node), - "node_text": node.body, - "node_text_nested": extract_node_nested_body(node), - "node_text_nested_exclusive": extract_node_nested_body_exclusive(node), - } - - return node_dict - - # Create a list of all org-roam nodes in the OrgData object - nodes = [ - node_to_dict(node, org_file) - for node in org_data[0][:] - if node.properties.get("ID") - ] - - return pd.DataFrame(nodes) - - -model.encode( - "What you need is the gist. Your mind needs the repetition to absorb the information. The true learning is on *doing* not in the input. If you can't use the material, you don't know. There is just a limited amount you can learn by listening. The rest you must do yourself. Courses give you ideas, you must figure out what to do with them." -) -model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1") - -query_embedding = model.encode("How big is London") -passage_embedding = model.encode( - [ - "London has 9,787,426 inhabitants at the 2011 census", - "London is known for its finacial district", - "London is full of criminals", - "Cairo is small", - ] -) - -print("Similarity:", util.dot_score(query_embedding, passage_embedding)) diff --git a/scratch/semgrep/requirements.txt b/scratch/semgrep/requirements.txt new file mode 100644 index 0000000..3e493ea --- /dev/null +++ b/scratch/semgrep/requirements.txt @@ -0,0 +1,2 @@ +sentence-transformers +chromadb diff --git a/scratch/semgrep/test.org b/scratch/semgrep/test.org deleted file mode 100644 index aac16d0..0000000 --- a/scratch/semgrep/test.org +++ /dev/null @@ -1,20 +0,0 @@ -#+title: Test - -It has started all wars. - -No more data. It is a case of food suply. No more data. It is a case of food -supply and nothing else for the. - -How an that - -#+begin_src emacs-lisp :results value raw -(json-encode - (org-element-map - (org-element-parse-buffer) - 'paragraph - (lambda (hl) - (car - (org-element-contents hl))))) -#+end_src - -#+RESULTS: diff --git a/scratch/semgrep/utils.py b/scratch/semgrep/utils.py new file mode 100644 index 0000000..66ee410 --- /dev/null +++ b/scratch/semgrep/utils.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# This functions don't necessarily stand alone. I used on debug process +import uuid + +response = collection.query( + query_texts="machines scale", + where={"node-id": "496d4874-be24-4601-8a87-214d55e11297"}, +) +collection.query(query_texts="machines scale") + + +def get_node(node_id): + return collection.get(where={"node-id": node_id}) + + +def get_data(amount): + nodeid = str(uuid.uuid4()) + data = ["heelo" for _ in range(amount)] + metadata = [{"node-id": nodeid, "point": i} for i in range(amount)] + ids = [str(uuid.uuid4()) for _ in range(amount)] + return {"documents": data, "metadatas": metadata, "ids": ids} -- cgit v1.2.3