CintraAI committed on
Commit
a404cdb
1 Parent(s): 9ede9d1

added support for ts php and ruby

Browse files
Chunker.py CHANGED
@@ -1,6 +1,6 @@
1
  from abc import ABC, abstractmethod
2
  from CodeParser import CodeParser
3
- from Utils import count_tokens
4
 
5
 
6
 
 
1
  from abc import ABC, abstractmethod
2
  from CodeParser import CodeParser
3
+ from utils import count_tokens
4
 
5
 
6
 
CodeParser.py CHANGED
@@ -5,6 +5,12 @@ from tree_sitter import Language, Parser, Node
5
  from typing import Union, List
6
  import logging
7
 
 
 
 
 
 
 
8
 
9
  class CodeParser:
10
  # Added a CACHE_DIR class attribute for caching
@@ -17,7 +23,11 @@ class CodeParser:
17
  "py": "python",
18
  "js": "javascript",
19
  "jsx": "javascript",
20
- "css": "css"
 
 
 
 
21
  }
22
  if file_extensions is None:
23
  self.language_names = []
@@ -28,37 +38,65 @@ class CodeParser:
28
  self._install_parsers()
29
 
30
  def _install_parsers(self):
31
- logging.basicConfig(level=logging.INFO) # Configure logging
32
-
33
- # Ensure cache directory exists
34
- if not os.path.exists(self.CACHE_DIR):
35
- os.makedirs(self.CACHE_DIR)
36
-
37
- # Configure logging to output to the terminal
38
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
39
 
40
- for language in self.language_names:
41
- repo_path = os.path.join(self.CACHE_DIR, f"tree-sitter-{language}")
42
-
43
- if not os.path.exists(repo_path):
44
- clone_command = f"git clone https://github.com/tree-sitter/tree-sitter-{language} {repo_path}"
45
- result = subprocess.run(
46
- clone_command,
47
- shell=True,
48
- stdout=subprocess.PIPE, # Capture standard output
49
- stderr=subprocess.PIPE # Capture standard error
50
- )
51
-
52
- # Check if cloning was successful
53
- if result.returncode != 0:
54
- logging.error(
55
- f"Failed to clone repository for {language}. Command: '{clone_command}'. Error: {result.stderr.decode('utf-8')}")
56
- raise Exception(f"Failed to clone repository for {language}")
57
-
58
- build_path = os.path.join(self.CACHE_DIR, f"build/{language}.so")
59
- Language.build_library(build_path, [repo_path])
60
-
61
- self.languages[language] = Language(build_path, language)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  def parse_code(self, code: str, file_extension: str) -> Union[None, Node]:
64
  language_name = self.language_extension_map.get(file_extension)
@@ -112,6 +150,31 @@ class CodeParser:
112
  'function_declaration': 'Function',
113
  'arrow_function': 'Arrow Function',
114
  'statement_block': 'Block',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  }
116
  }
117
 
@@ -119,6 +182,8 @@ class CodeParser:
119
  return node_types[file_extension]
120
  elif file_extension == "jsx":
121
  return node_types["js"]
 
 
122
  else:
123
  raise ValueError("Unsupported file type")
124
 
@@ -135,6 +200,17 @@ class CodeParser:
135
  'js': {
136
  'comment': 'Comment',
137
  'decorator': 'Decorator', # Broadened category
 
 
 
 
 
 
 
 
 
 
 
138
  }
139
  }
140
 
@@ -263,11 +339,4 @@ class CodeParser:
263
  self.map_line_to_node_type(child, line_to_node_type, depth + 1)
264
 
265
  return line_to_node_type
266
-
267
- def print_simple_line_numbers_with_code(self, code: str):
268
-
269
- code_lines = code.split('\n')
270
-
271
- for i, line in enumerate(code_lines):
272
- print(f"Line {i + 1}: {line}")
273
-
 
5
  from typing import Union, List
6
  import logging
7
 
8
def return_simple_line_numbers_with_code(code: str) -> str:
    """Return *code* with every line prefixed by ``Line <n>: ``.

    The input is split on ``'\\n'`` only, numbering starts at 1, and the
    prefixed lines are joined back with ``'\\n'``. An empty string therefore
    yields the single line ``"Line 1: "``.
    """
    return "\n".join(
        f"Line {number}: {text}"
        for number, text in enumerate(code.split('\n'), start=1)
    )
13
+
14
 
15
  class CodeParser:
16
  # Added a CACHE_DIR class attribute for caching
 
23
  "py": "python",
24
  "js": "javascript",
25
  "jsx": "javascript",
26
+ "css": "css",
27
+ "ts": "typescript",
28
+ "tsx": "typescript",
29
+ "php": "php",
30
+ "rb": "ruby"
31
  }
32
  if file_extensions is None:
33
  self.language_names = []
 
38
  self._install_parsers()
39
 
40
  def _install_parsers(self):
 
 
 
 
 
 
 
41
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
42
 
43
+ try:
44
+ # Ensure cache directory exists
45
+ if not os.path.exists(self.CACHE_DIR):
46
+ os.makedirs(self.CACHE_DIR)
47
+
48
+ for language in self.language_names:
49
+ repo_path = os.path.join(self.CACHE_DIR, f"tree-sitter-{language}")
50
+
51
+ # Check if the repository exists and contains necessary files
52
+ if not os.path.exists(repo_path) or not self._is_repo_valid(repo_path, language):
53
+ try:
54
+ if os.path.exists(repo_path):
55
+ logging.info(f"Updating existing repository for {language}")
56
+ update_command = f"cd {repo_path} && git pull"
57
+ subprocess.run(update_command, shell=True, check=True)
58
+ else:
59
+ logging.info(f"Cloning repository for {language}")
60
+ clone_command = f"git clone https://github.com/tree-sitter/tree-sitter-{language} {repo_path}"
61
+ subprocess.run(clone_command, shell=True, check=True)
62
+ except subprocess.CalledProcessError as e:
63
+ logging.error(f"Failed to clone/update repository for {language}. Error: {e}")
64
+ continue
65
+
66
+ try:
67
+ build_path = os.path.join(self.CACHE_DIR, f"build/{language}.so")
68
+
69
+ # Special handling for TypeScript
70
+ if language == 'typescript':
71
+ ts_dir = os.path.join(repo_path, 'typescript')
72
+ tsx_dir = os.path.join(repo_path, 'tsx')
73
+ if os.path.exists(ts_dir) and os.path.exists(tsx_dir):
74
+ Language.build_library(build_path, [ts_dir, tsx_dir])
75
+ else:
76
+ raise FileNotFoundError(f"TypeScript or TSX directory not found in {repo_path}")
77
+ if language == 'php':
78
+ php_dir = os.path.join(repo_path, 'php')
79
+ Language.build_library(build_path, [php_dir])
80
+ else:
81
+ Language.build_library(build_path, [repo_path])
82
+
83
+ self.languages[language] = Language(build_path, language)
84
+ logging.info(f"Successfully built and loaded {language} parser")
85
+ except Exception as e:
86
+ logging.error(f"Failed to build or load language {language}. Error: {str(e)}")
87
+
88
+ except Exception as e:
89
+ logging.error(f"An unexpected error occurred during parser installation: {str(e)}")
90
+
91
+ def _is_repo_valid(self, repo_path: str, language: str) -> bool:
92
+ """Check if the repository contains necessary files."""
93
+ if language == 'typescript':
94
+ return (os.path.exists(os.path.join(repo_path, 'typescript', 'src', 'parser.c')) and
95
+ os.path.exists(os.path.join(repo_path, 'tsx', 'src', 'parser.c')))
96
+ elif language == 'php':
97
+ return os.path.exists(os.path.join(repo_path, 'php', 'src', 'parser.c'))
98
+ else:
99
+ return os.path.exists(os.path.join(repo_path, 'src', 'parser.c'))
100
 
101
  def parse_code(self, code: str, file_extension: str) -> Union[None, Node]:
102
  language_name = self.language_extension_map.get(file_extension)
 
150
  'function_declaration': 'Function',
151
  'arrow_function': 'Arrow Function',
152
  'statement_block': 'Block',
153
+ },
154
+ 'ts': {
155
+ 'import_statement': 'Import',
156
+ 'export_statement': 'Export',
157
+ 'class_declaration': 'Class',
158
+ 'function_declaration': 'Function',
159
+ 'arrow_function': 'Arrow Function',
160
+ 'statement_block': 'Block',
161
+ 'interface_declaration': 'Interface',
162
+ 'type_alias_declaration': 'Type Alias',
163
+ },
164
+ 'php': {
165
+ 'namespace_definition': 'Namespace',
166
+ 'class_declaration': 'Class',
167
+ 'method_declaration': 'Method',
168
+ 'function_definition': 'Function',
169
+ 'interface_declaration': 'Interface',
170
+ 'trait_declaration': 'Trait',
171
+ },
172
+ 'rb': {
173
+ 'class': 'Class',
174
+ 'method': 'Method',
175
+ 'module': 'Module',
176
+ 'singleton_class': 'Singleton Class',
177
+ 'begin': 'Begin Block',
178
  }
179
  }
180
 
 
182
  return node_types[file_extension]
183
  elif file_extension == "jsx":
184
  return node_types["js"]
185
+ elif file_extension == "tsx":
186
+ return node_types["ts"]
187
  else:
188
  raise ValueError("Unsupported file type")
189
 
 
200
  'js': {
201
  'comment': 'Comment',
202
  'decorator': 'Decorator', # Broadened category
203
+ },
204
+ 'ts': {
205
+ 'comment': 'Comment',
206
+ 'decorator': 'Decorator',
207
+ },
208
+ 'php': {
209
+ 'comment': 'Comment',
210
+ 'attribute': 'Attribute',
211
+ },
212
+ 'rb': {
213
+ 'comment': 'Comment',
214
  }
215
  }
216
 
 
339
  self.map_line_to_node_type(child, line_to_node_type, depth + 1)
340
 
341
  return line_to_node_type
342
+
 
 
 
 
 
 
 
__pycache__/Chunker.cpython-312.pyc ADDED
Binary file (4.86 kB). View file
 
__pycache__/CodeParser.cpython-312.pyc ADDED
Binary file (14.9 kB). View file
 
__pycache__/test_code_chunker.cpython-312.pyc ADDED
Binary file (14.7 kB). View file
 
__pycache__/utils.cpython-312.pyc ADDED
Binary file (896 Bytes). View file
 
requirements.txt CHANGED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ altair==5.2.0
3
+ annotated-types==0.6.0
4
+ anyio==4.3.0
5
+ attrs==23.2.0
6
+ certifi==2024.2.2
7
+ charset-normalizer==3.3.2
8
+ click==8.1.7
9
+ colorama==0.4.6
10
+ contourpy==1.2.0
11
+ cycler==0.12.1
12
+ fastapi==0.110.0
13
+ ffmpy==0.3.2
14
+ filelock==3.13.1
15
+ fonttools==4.49.0
16
+ fsspec==2024.2.0
17
+ gradio==4.19.2
18
+ gradio_client==0.10.1
19
+ h11==0.14.0
20
+ httpcore==1.0.4
21
+ httpx==0.27.0
22
+ huggingface-hub==0.20.3
23
+ idna==3.6
24
+ importlib_resources==6.1.2
25
+ Jinja2==3.1.3
26
+ jsonschema==4.21.1
27
+ jsonschema-specifications==2023.12.1
28
+ kiwisolver==1.4.5
29
+ markdown-it-py==3.0.0
30
+ MarkupSafe==2.1.5
31
+ matplotlib==3.8.3
32
+ mdurl==0.1.2
33
+ numpy==1.26.4
34
+ orjson==3.9.15
35
+ packaging==23.2
36
+ pandas==2.2.1
37
+ pillow==10.2.0
38
+ pydantic==2.6.2
39
+ pydantic_core==2.16.3
40
+ pydub==0.25.1
41
+ Pygments==2.17.2
42
+ pyparsing==3.1.1
43
+ python-dateutil==2.8.2
44
+ python-multipart==0.0.9
45
+ pytz==2024.1
46
+ PyYAML==6.0.1
47
+ referencing==0.33.0
48
+ regex==2023.12.25
49
+ requests==2.31.0
50
+ rich==13.7.0
51
+ rpds-py==0.18.0
52
+ ruff==0.2.2
53
+ semantic-version==2.10.0
54
+ setuptools==69.1.1
55
+ shellingham==1.5.4
56
+ six==1.16.0
57
+ sniffio==1.3.1
58
+ starlette==0.36.3
59
+ tomlkit==0.12.0
60
+ toolz==0.12.1
61
+ tqdm==4.66.2
62
+ tree-sitter==0.20.4
63
+ typer==0.9.0
64
+ typing_extensions==4.10.0
65
+ tzdata==2024.1
66
+ urllib3==2.2.1
67
+ uvicorn==0.27.1
68
+ websockets==11.0.3
test_code_chunker.py CHANGED
@@ -1,13 +1,11 @@
1
  import unittest
2
  from unittest.mock import patch
3
  from Chunker import Chunker, CodeChunker
4
- from utils import load_json
5
  import tiktoken
6
- import json
7
- import os
8
 
9
-
10
 
 
11
  # Mocking the count_tokens function as it's external and not the focus of these tests
12
  def mock_count_tokens(string: str, encoding_name='gpt-4') -> int:
13
  """Returns the number of tokens in a text string."""
@@ -18,7 +16,7 @@ def mock_count_tokens(string: str, encoding_name='gpt-4') -> int:
18
  # Python Test Class
19
  class TestCodeChunkerPython(unittest.TestCase):
20
  def setUp(self):
21
- self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
22
  self.mock_count_tokens = self.patcher.start()
23
  self.code_chunker = CodeChunker(file_extension='py')
24
  self.mock_codebase = load_json('mock_codefiles.json')
 
1
  import unittest
2
  from unittest.mock import patch
3
  from Chunker import Chunker, CodeChunker
 
4
  import tiktoken
5
+ from utils import load_json
 
6
 
 
7
 
8
+
9
  # Mocking the count_tokens function as it's external and not the focus of these tests
10
  def mock_count_tokens(string: str, encoding_name='gpt-4') -> int:
11
  """Returns the number of tokens in a text string."""
 
16
  # Python Test Class
17
  class TestCodeChunkerPython(unittest.TestCase):
18
  def setUp(self):
19
+ self.patcher = patch('utils.count_tokens', side_effect=mock_count_tokens)
20
  self.mock_count_tokens = self.patcher.start()
21
  self.code_chunker = CodeChunker(file_extension='py')
22
  self.mock_codebase = load_json('mock_codefiles.json')