code-chunker / CodeParser.py
CintraAI's picture
added codefiles
a1983fb
raw
history blame
10.1 kB
import os
import subprocess
from typing import List, Dict, Union, Tuple
from tree_sitter import Language, Parser, Node
from typing import Union, List
import logging
class CodeParser:
    """Parse source code with tree-sitter and locate structurally relevant lines.

    On construction the grammars for the requested file extensions are cloned
    from GitHub into ``CACHE_DIR`` (if not already present), compiled into a
    shared library and loaded into ``self.languages`` (language name ->
    ``tree_sitter.Language``).

    Supported extensions are the keys of ``self.language_extension_map``
    (``py``, ``js``, ``jsx``, ``css``).

    NOTE(review): this relies on ``Language.build_library``,
    ``Language(path, name)`` and ``Parser.set_language``; these look like the
    older py-tree-sitter API - confirm the pinned tree_sitter version.
    """

    # Directory holding the cloned grammar repositories and compiled libraries.
    CACHE_DIR = os.path.expanduser("~/.code_parser_cache")

    def __init__(self, file_extensions: Union[None, List[str], str] = None):
        """Set up the extension map and install parsers for ``file_extensions``.

        Args:
            file_extensions: A single extension, a list of extensions, or
                ``None``. Unknown extensions are ignored. With ``None`` no
                grammar is installed, so every later parse request reports
                "Language parser not found".

        Raises:
            RuntimeError: If cloning a grammar repository fails.
        """
        self.language_extension_map = {
            "py": "python",
            "js": "javascript",
            "jsx": "javascript",
            "css": "css",
        }
        self.language_names = self._languages_for_extensions(file_extensions)
        self.languages = {}
        self._install_parsers()

    def _languages_for_extensions(self, file_extensions: Union[None, List[str], str]) -> List[str]:
        """Map extensions to unique language names, keeping first-seen order."""
        if file_extensions is None:
            return []
        if isinstance(file_extensions, str):
            file_extensions = [file_extensions]
        known = (
            self.language_extension_map[ext]
            for ext in file_extensions
            if ext in self.language_extension_map
        )
        # dict.fromkeys de-duplicates while preserving order, so "js" + "jsx"
        # does not clone/build the javascript grammar twice.
        return list(dict.fromkeys(known))

    def _install_parsers(self):
        """Clone (if needed), build and load the grammar of every requested language.

        Raises:
            RuntimeError: If ``git clone`` fails or cannot be started.
        """
        # Only the first basicConfig call has an effect once the root logger
        # has a handler, so a single call is kept.
        logging.basicConfig(level=logging.INFO)
        os.makedirs(self.CACHE_DIR, exist_ok=True)
        build_dir = os.path.join(self.CACHE_DIR, "build")

        for language in self.language_names:
            repo_path = os.path.join(self.CACHE_DIR, f"tree-sitter-{language}")
            if not os.path.exists(repo_path):
                self._clone_grammar(language, repo_path)

            # Make sure the output directory exists before compiling into it.
            os.makedirs(build_dir, exist_ok=True)
            build_path = os.path.join(build_dir, f"{language}.so")
            Language.build_library(build_path, [repo_path])
            self.languages[language] = Language(build_path, language)

    def _clone_grammar(self, language: str, repo_path: str) -> None:
        """Clone the tree-sitter grammar repository for ``language`` into ``repo_path``."""
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        clone_command = [
            "git",
            "clone",
            f"https://github.com/tree-sitter/tree-sitter-{language}",
            repo_path,
        ]
        printable_command = " ".join(clone_command)
        try:
            result = subprocess.run(
                clone_command,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except OSError as error:
            # Raised e.g. when the git executable cannot be found.
            logging.error(
                "Failed to clone repository for %s. Command: '%s'. Error: %s",
                language, printable_command, error,
            )
            raise RuntimeError(f"Failed to clone repository for {language}") from error

        if result.returncode != 0:
            logging.error(
                "Failed to clone repository for %s. Command: '%s'. Error: %s",
                language, printable_command,
                result.stderr.decode("utf-8", errors="replace"),
            )
            raise RuntimeError(f"Failed to clone repository for {language}")

    def _lookup_language(self, file_extension: str):
        """Return ``(language_name, language)``; either item is ``None`` if unknown."""
        language_name = self.language_extension_map.get(file_extension)
        language = None if language_name is None else self.languages.get(language_name)
        return language_name, language

    def _language_or_report(self, file_extension: str):
        """Return the loaded language, or print the reason and return ``None``."""
        language_name, language = self._lookup_language(file_extension)
        if language_name is None:
            print(f"Unsupported file type: {file_extension}")
        elif language is None:
            print("Language parser not found")
        return language

    def _language_or_raise(self, file_extension: str):
        """Return the loaded language, or raise ``ValueError`` naming the reason."""
        language_name, language = self._lookup_language(file_extension)
        if language_name is None:
            raise ValueError("Unsupported file type")
        if language is None:
            raise ValueError("Language parser not found")
        return language

    @staticmethod
    def _parse(language, code: str):
        """Parse ``code`` (encoded as UTF-8) with a fresh parser for ``language``."""
        parser = Parser()
        parser.set_language(language)
        return parser.parse(bytes(code, "utf8"))

    @staticmethod
    def _walk(node):
        """Yield ``node`` and all its descendants in pre-order.

        Uses an explicit stack instead of recursion so that deeply nested
        syntax trees cannot exceed Python's recursion limit.
        """
        pending = [node]
        while pending:
            current = pending.pop()
            yield current
            # Reversed so the first child is popped (and visited) first.
            pending.extend(reversed(current.children))

    @staticmethod
    def _start_lines_grouped_by_label(labelled_nodes) -> List[int]:
        """Flatten ``(node, label)`` pairs into their start lines.

        Lines are grouped by label in first-seen label order; within a label
        each line appears once, in first-seen order. The same line may appear
        again under a different label. Dicts serve as ordered sets.
        """
        lines_by_label = {}
        for node, label in labelled_nodes:
            lines_by_label.setdefault(label, {})[node.start_point[0]] = None
        return [line for lines in lines_by_label.values() for line in lines]

    def parse_code(self, code: str, file_extension: str) -> Union[None, "Node"]:
        """Parse ``code`` and return the root node of its syntax tree.

        Returns:
            The root node, or ``None`` (after printing a message) when the
            extension is unsupported, its parser was not installed, or
            parsing produced no tree.
        """
        language = self._language_or_report(file_extension)
        if language is None:
            return None

        tree = self._parse(language, code)
        if tree is None:
            print("Failed to parse the code")
            return None
        return tree.root_node

    def extract_points_of_interest(self, node: "Node", file_extension: str) -> List[Tuple["Node", str]]:
        """Collect ``(node, label)`` for every node under ``node`` whose type is of interest.

        Nodes are returned in pre-order (a parent before its children).

        Raises:
            ValueError: If ``file_extension`` is not supported.
        """
        labels = self._get_node_types_of_interest(file_extension)
        return [
            (current, labels[current.type])
            for current in self._walk(node)
            if current.type in labels
        ]

    def _get_node_types_of_interest(self, file_extension: str) -> Dict[str, str]:
        """Return the node-type -> label mapping used to find points of interest.

        ``jsx`` shares the ``js`` mapping.

        Raises:
            ValueError: If ``file_extension`` is not supported.
        """
        node_types = {
            'py': {
                'import_statement': 'Import',
                'export_statement': 'Export',
                'class_definition': 'Class',
                'function_definition': 'Function',
            },
            'css': {
                'tag_name': 'Tag',
                '@media': 'Media Query',
            },
            'js': {
                'import_statement': 'Import',
                'export_statement': 'Export',
                'class_declaration': 'Class',
                'function_declaration': 'Function',
                'arrow_function': 'Arrow Function',
                'statement_block': 'Block',
            },
        }
        key = "js" if file_extension == "jsx" else file_extension
        if key not in node_types:
            raise ValueError("Unsupported file type")
        return node_types[key]

    def _get_nodes_for_comments(self, file_extension: str) -> Dict[str, str]:
        """Return the node-type -> label mapping used to find comments and decorators.

        ``jsx`` shares the ``js`` mapping.

        Raises:
            ValueError: If ``file_extension`` is not supported.
        """
        node_types = {
            'py': {
                'comment': 'Comment',
                'decorator': 'Decorator',  # Broadened category
            },
            'css': {
                'comment': 'Comment',
            },
            'js': {
                'comment': 'Comment',
                'decorator': 'Decorator',  # Broadened category
            },
        }
        key = "js" if file_extension == "jsx" else file_extension
        if key not in node_types:
            raise ValueError("Unsupported file type")
        return node_types[key]

    def extract_comments(self, node: "Node", file_extension: str) -> List[Tuple["Node", str]]:
        """Collect ``(node, label)`` for every comment/decorator node under ``node``.

        Nodes are returned in pre-order (a parent before its children).

        Raises:
            ValueError: If ``file_extension`` is not supported.
        """
        labels = self._get_nodes_for_comments(file_extension)
        return [
            (current, labels[current.type])
            for current in self._walk(node)
            if current.type in labels
        ]

    def get_lines_for_points_of_interest(self, code: str, file_extension: str) -> List[int]:
        """Return the 0-based start lines of all points of interest in ``code``.

        The result is grouped by kind of point (not sorted); a line can occur
        more than once if nodes of different kinds start on it.

        Raises:
            ValueError: If the extension is unsupported or its parser is not installed.
        """
        language = self._language_or_raise(file_extension)
        root_node = self._parse(language, code).root_node
        points_of_interest = self.extract_points_of_interest(root_node, file_extension)
        return self._start_lines_grouped_by_label(points_of_interest)

    def get_lines_for_comments(self, code: str, file_extension: str) -> List[int]:
        """Return the 0-based start lines of all comments/decorators in ``code``.

        The result is grouped by kind (not sorted); a line can occur more than
        once if nodes of different kinds start on it.

        Raises:
            ValueError: If the extension is unsupported or its parser is not installed.
        """
        language = self._language_or_raise(file_extension)
        root_node = self._parse(language, code).root_node
        comments = self.extract_comments(root_node, file_extension)
        return self._start_lines_grouped_by_label(comments)

    def print_all_line_types(self, code: str, file_extension: str):
        """Print, for each line on which a node starts, the node types and the line text.

        Prints a message and returns without output when the extension is
        unsupported or its parser was not installed.
        """
        language = self._language_or_report(file_extension)
        if language is None:
            return

        root_node = self._parse(language, code).root_node
        line_to_node_type = self.map_line_to_node_type(root_node)
        code_lines = code.split('\n')
        for line_num, node_types in line_to_node_type.items():
            line_content = code_lines[line_num - 1]  # line_num is 1-based
            print(f"line {line_num}: {', '.join(node_types)} | Code: {line_content}")

    def map_line_to_node_type(self, node, line_to_node_type=None, depth=0):
        """Map 1-based line numbers to the types of the nodes starting on them.

        Args:
            node: Root of the (sub)tree to walk.
            line_to_node_type: Optional dict to extend in place; a new dict is
                created when omitted.
            depth: Unused; kept so existing callers passing it keep working.

        Returns:
            The mapping; node types per line are listed in pre-order.
        """
        if line_to_node_type is None:
            line_to_node_type = {}
        for current in self._walk(node):
            # Tree-sitter lines are 0-indexed; converting to 1-indexed.
            start_line = current.start_point[0] + 1
            line_to_node_type.setdefault(start_line, []).append(current.type)
        return line_to_node_type

    def print_simple_line_numbers_with_code(self, code: str):
        """Print every line of ``code`` prefixed with its 1-based line number."""
        for line_number, line in enumerate(code.split('\n'), start=1):
            print(f"Line {line_number}: {line}")