added codefiles
- .vscode/settings.json +11 -0
- Chunker.py +124 -0
- CodeParser.py +273 -0
- mock_codefiles.json +27 -0
- requirements.txt +0 -0
- test_code_chunker.py +214 -0
- utils.py +14 -0
.vscode/settings.json
ADDED
@@ -0,0 +1,11 @@
{
    "python.testing.unittestArgs": [
        "-v",
        "-s",
        ".",
        "-p",
        "test_*.py"
    ],
    "python.testing.pytestEnabled": false,
    "python.testing.unittestEnabled": true
}
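For reference, this VS Code configuration is equivalent to running unittest discovery from the project root with the same arguments (verbose, start directory ".", pattern "test_*.py"):

    python -m unittest discover -v -s . -p "test_*.py"
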
Chunker.py
ADDED
@@ -0,0 +1,124 @@
from abc import ABC, abstractmethod
from CodeParser import CodeParser
from utils import count_tokens


class Chunker(ABC):
    def __init__(self, encoding_name="gpt-4"):
        self.encoding_name = encoding_name

    @abstractmethod
    def chunk(self, content, token_limit):
        pass

    @abstractmethod
    def get_chunk(self, chunked_content, chunk_number):
        pass

    @staticmethod
    def print_chunks(chunks):
        for chunk_number, chunk_code in chunks.items():
            print(f"Chunk {chunk_number}:")
            print("=" * 40)
            print(chunk_code)
            print("=" * 40)

    @staticmethod
    def consolidate_chunks_into_file(chunks):
        return "\n".join(chunks.values())

    @staticmethod
    def count_lines(consolidated_chunks):
        lines = consolidated_chunks.split("\n")
        return len(lines)


class CodeChunker(Chunker):
    def __init__(self, file_extension, encoding_name="gpt-4"):
        super().__init__(encoding_name)
        self.file_extension = file_extension

    def chunk(self, code, token_limit) -> dict:
        code_parser = CodeParser(self.file_extension)
        chunks = {}
        current_chunk = ""
        token_count = 0
        lines = code.split("\n")
        i = 0
        chunk_number = 1
        start_line = 0
        breakpoints = sorted(code_parser.get_lines_for_points_of_interest(code, self.file_extension))
        comments = sorted(code_parser.get_lines_for_comments(code, self.file_extension))
        adjusted_breakpoints = []
        for bp in breakpoints:
            current_line = bp - 1
            highest_comment_line = None  # Initialize with None to indicate no comment line has been found yet
            while current_line in comments:
                highest_comment_line = current_line  # Update the highest comment line found so far
                current_line -= 1  # Move to the previous line

            if highest_comment_line:  # If a comment block precedes the breakpoint, start the chunk at its first line
                adjusted_breakpoints.append(highest_comment_line)
            else:
                adjusted_breakpoints.append(bp)  # No comments before the breakpoint, keep the original breakpoint

        breakpoints = sorted(set(adjusted_breakpoints))  # Ensure breakpoints are unique and sorted

        while i < len(lines):
            line = lines[i]
            new_token_count = count_tokens(line, self.encoding_name)
            if token_count + new_token_count > token_limit:

                # Set the stop line to the last breakpoint before the current line
                if i in breakpoints:
                    stop_line = i
                else:
                    stop_line = max(max([x for x in breakpoints if x < i], default=start_line), start_line)

                # If the stop line is the same as the start line, it means we haven't reached a breakpoint yet and we need to move to the next line to find one
                if stop_line == start_line and i not in breakpoints:
                    token_count += new_token_count
                    i += 1

                # If the stop line equals both the start line and the current line, the current line alone already exceeds the limit; include it and move on
                elif stop_line == start_line and i == stop_line:
                    token_count += new_token_count
                    i += 1

                # If the stop line is the same as the start line and the current line is a breakpoint, close the chunk at the breakpoint
                elif stop_line == start_line and i in breakpoints:
                    current_chunk = "\n".join(lines[start_line:stop_line])
                    if current_chunk.strip():  # If the current chunk is not just whitespace
                        chunks[chunk_number] = current_chunk  # Using chunk_number as key
                        chunk_number += 1

                    token_count = 0
                    start_line = i
                    i += 1

                # If the stop line is different from the start line, it means we're at the end of a block
                else:
                    current_chunk = "\n".join(lines[start_line:stop_line])
                    if current_chunk.strip():
                        chunks[chunk_number] = current_chunk  # Using chunk_number as key
                        chunk_number += 1

                    i = stop_line
                    token_count = 0
                    start_line = stop_line
            else:
                # If the token count is still within the limit, add the line to the current chunk
                token_count += new_token_count
                i += 1

        # Append remaining code, if any, ensuring it's not empty or whitespace
        current_chunk_code = "\n".join(lines[start_line:])
        if current_chunk_code.strip():  # Checks if the chunk is not just whitespace
            chunks[chunk_number] = current_chunk_code  # Using chunk_number as key

        return chunks

    def get_chunk(self, chunked_codebase, chunk_number):
        return chunked_codebase[chunk_number]

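A minimal usage sketch of the CodeChunker above (hypothetical, not part of the commit). It assumes the sample files from mock_codefiles.json below; note that the underlying CodeParser clones and builds the relevant tree-sitter grammar into ~/.code_parser_cache on first use.

# Hypothetical usage sketch, not part of this commit.
from Chunker import Chunker, CodeChunker
from utils import load_json

code = load_json('mock_codefiles.json')['simple.py']  # sample file shipped in this commit
chunker = CodeChunker(file_extension='py')            # 'py', 'js', 'jsx' and 'css' are supported
chunks = chunker.chunk(code, token_limit=25)          # dict keyed by chunk number, starting at 1
Chunker.print_chunks(chunks)
restored = Chunker.consolidate_chunks_into_file(chunks)
assert Chunker.count_lines(restored) == len(code.split("\n"))  # round-trip property asserted by the tests below
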
CodeParser.py
ADDED
@@ -0,0 +1,273 @@
import os
import subprocess
from typing import List, Dict, Union, Tuple
from tree_sitter import Language, Parser, Node
import logging


class CodeParser:
    # Added a CACHE_DIR class attribute for caching
    CACHE_DIR = os.path.expanduser("~/.code_parser_cache")

    def __init__(self, file_extensions: Union[None, List[str], str] = None):
        if isinstance(file_extensions, str):
            file_extensions = [file_extensions]
        self.language_extension_map = {
            "py": "python",
            "js": "javascript",
            "jsx": "javascript",
            "css": "css"
        }
        if file_extensions is None:
            self.language_names = []
        else:
            self.language_names = [self.language_extension_map.get(ext) for ext in file_extensions if
                                   ext in self.language_extension_map]
        self.languages = {}
        self._install_parsers()

    def _install_parsers(self):
        # Configure logging to output to the terminal
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

        # Ensure cache directory exists
        if not os.path.exists(self.CACHE_DIR):
            os.makedirs(self.CACHE_DIR)

        for language in self.language_names:
            repo_path = os.path.join(self.CACHE_DIR, f"tree-sitter-{language}")

            if not os.path.exists(repo_path):
                clone_command = f"git clone https://github.com/tree-sitter/tree-sitter-{language} {repo_path}"
                result = subprocess.run(
                    clone_command,
                    shell=True,
                    stdout=subprocess.PIPE,  # Capture standard output
                    stderr=subprocess.PIPE   # Capture standard error
                )

                # Check if cloning was successful
                if result.returncode != 0:
                    logging.error(
                        f"Failed to clone repository for {language}. Command: '{clone_command}'. Error: {result.stderr.decode('utf-8')}")
                    raise Exception(f"Failed to clone repository for {language}")

            build_path = os.path.join(self.CACHE_DIR, f"build/{language}.so")
            Language.build_library(build_path, [repo_path])

            self.languages[language] = Language(build_path, language)

    def parse_code(self, code: str, file_extension: str) -> Union[None, Node]:
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            print(f"Unsupported file type: {file_extension}")
            return None

        language = self.languages.get(language_name)
        if language is None:
            print("Language parser not found")
            return None

        parser = Parser()
        parser.set_language(language)
        tree = parser.parse(bytes(code, "utf8"))

        if tree is None:
            print("Failed to parse the code")
            return None

        return tree.root_node

    def extract_points_of_interest(self, node: Node, file_extension: str) -> List[Tuple[Node, str]]:
        node_types_of_interest = self._get_node_types_of_interest(file_extension)

        points_of_interest = []
        if node.type in node_types_of_interest.keys():
            points_of_interest.append((node, node_types_of_interest[node.type]))

        for child in node.children:
            points_of_interest.extend(self.extract_points_of_interest(child, file_extension))

        return points_of_interest

    def _get_node_types_of_interest(self, file_extension: str) -> Dict[str, str]:
        node_types = {
            'py': {
                'import_statement': 'Import',
                'export_statement': 'Export',
                'class_definition': 'Class',
                'function_definition': 'Function',
            },
            'css': {
                'tag_name': 'Tag',
                '@media': 'Media Query',
            },
            'js': {
                'import_statement': 'Import',
                'export_statement': 'Export',
                'class_declaration': 'Class',
                'function_declaration': 'Function',
                'arrow_function': 'Arrow Function',
                'statement_block': 'Block',
            }
        }

        if file_extension in node_types.keys():
            return node_types[file_extension]
        elif file_extension == "jsx":
            return node_types["js"]
        else:
            raise ValueError("Unsupported file type")

    def _get_nodes_for_comments(self, file_extension: str) -> Dict[str, str]:
        node_types = {
            'py': {
                'comment': 'Comment',
                'decorator': 'Decorator',  # Broadened category
            },
            'css': {
                'comment': 'Comment'
            },
            'js': {
                'comment': 'Comment',
                'decorator': 'Decorator',  # Broadened category
            }
        }

        if file_extension in node_types.keys():
            return node_types[file_extension]
        elif file_extension == "jsx":
            return node_types["js"]
        else:
            raise ValueError("Unsupported file type")

    def extract_comments(self, node: Node, file_extension: str) -> List[Tuple[Node, str]]:
        node_types_of_interest = self._get_nodes_for_comments(file_extension)

        comments = []
        if node.type in node_types_of_interest:
            comments.append((node, node_types_of_interest[node.type]))

        for child in node.children:
            comments.extend(self.extract_comments(child, file_extension))

        return comments

    def get_lines_for_points_of_interest(self, code: str, file_extension: str) -> List[int]:
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            raise ValueError("Unsupported file type")

        language = self.languages.get(language_name)
        if language is None:
            raise ValueError("Language parser not found")

        parser = Parser()
        parser.set_language(language)

        tree = parser.parse(bytes(code, "utf8"))

        root_node = tree.root_node
        points_of_interest = self.extract_points_of_interest(root_node, file_extension)

        line_numbers_with_type_of_interest = {}

        for node, type_of_interest in points_of_interest:
            start_line = node.start_point[0]
            if type_of_interest not in line_numbers_with_type_of_interest:
                line_numbers_with_type_of_interest[type_of_interest] = []

            if start_line not in line_numbers_with_type_of_interest[type_of_interest]:
                line_numbers_with_type_of_interest[type_of_interest].append(start_line)

        lines_of_interest = []
        for _, line_numbers in line_numbers_with_type_of_interest.items():
            lines_of_interest.extend(line_numbers)

        return lines_of_interest

    def get_lines_for_comments(self, code: str, file_extension: str) -> List[int]:
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            raise ValueError("Unsupported file type")

        language = self.languages.get(language_name)
        if language is None:
            raise ValueError("Language parser not found")

        parser = Parser()
        parser.set_language(language)

        tree = parser.parse(bytes(code, "utf8"))

        root_node = tree.root_node
        comments = self.extract_comments(root_node, file_extension)

        line_numbers_with_comments = {}

        for node, type_of_interest in comments:
            start_line = node.start_point[0]
            if type_of_interest not in line_numbers_with_comments:
                line_numbers_with_comments[type_of_interest] = []

            if start_line not in line_numbers_with_comments[type_of_interest]:
                line_numbers_with_comments[type_of_interest].append(start_line)

        lines_of_interest = []
        for _, line_numbers in line_numbers_with_comments.items():
            lines_of_interest.extend(line_numbers)

        return lines_of_interest

    def print_all_line_types(self, code: str, file_extension: str):
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            print(f"Unsupported file type: {file_extension}")
            return

        language = self.languages.get(language_name)
        if language is None:
            print("Language parser not found")
            return

        parser = Parser()
        parser.set_language(language)
        tree = parser.parse(bytes(code, "utf8"))

        root_node = tree.root_node
        line_to_node_type = self.map_line_to_node_type(root_node)

        code_lines = code.split('\n')

        for line_num, node_types in line_to_node_type.items():
            line_content = code_lines[line_num - 1]  # Adjusting index for zero-based indexing
            print(f"line {line_num}: {', '.join(node_types)} | Code: {line_content}")

    def map_line_to_node_type(self, node, line_to_node_type=None, depth=0):
        if line_to_node_type is None:
            line_to_node_type = {}

        start_line = node.start_point[0] + 1  # Tree-sitter lines are 0-indexed; converting to 1-indexed

        # Only add the node type if it's the start line of the node
        if start_line not in line_to_node_type:
            line_to_node_type[start_line] = []
        line_to_node_type[start_line].append(node.type)

        for child in node.children:
            self.map_line_to_node_type(child, line_to_node_type, depth + 1)

        return line_to_node_type

    def print_simple_line_numbers_with_code(self, code: str):
        code_lines = code.split('\n')

        for i, line in enumerate(code_lines):
            print(f"Line {i + 1}: {line}")

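A minimal usage sketch of CodeParser in isolation (hypothetical, not part of the commit). The returned line numbers are 0-based tree-sitter start lines, and the first instantiation clones and builds the tree-sitter-python grammar into ~/.code_parser_cache.

# Hypothetical usage sketch, not part of this commit.
from CodeParser import CodeParser

parser = CodeParser('py')  # accepts a single extension or a list of extensions
sample = "import os\n\n# entry point\ndef main():\n    pass\n"
print(parser.get_lines_for_points_of_interest(sample, 'py'))  # 0-based lines of imports/classes/functions
print(parser.get_lines_for_comments(sample, 'py'))            # 0-based lines of comments/decorators
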
mock_codefiles.json
ADDED
@@ -0,0 +1,27 @@
{
"simple.py": "import sys\n\n# This is a sample Python file\n\ndef main():\n print('Hello, world!')\n\nif __name__ == '__main__':\n main()",
"text_only.py": "# This file is empty and should test the chunker's ability to handle empty files\n",
"routes.py": "from flask import Flask, jsonify, request, redirect, url_for\napp = Flask(__name__)\n\n@app.route('/', methods=['GET'])\ndef home():\n return '<h1>Welcome to the Home Page</h1>', 200\n\n@authenticate # Hypothetical decorator for authentication\n@log_access # Hypothetical decorator for logging access\n@app.route('/api/data', methods=['GET'])\ndef get_data():\n # Simulate fetching data from a database or external service\n data = {'key': 'This is some data'}\n return jsonify(data), 200\n\n@app.route('/api/data/<int:data_id>', methods=['GET'])\ndef get_data_by_id(data_id):\n # Simulate fetching specific data by ID\n data = {'id': data_id, 'value': 'Specific data based on ID'}\n return jsonify(data), 200\n\n@app.route('/api/data', methods=['POST'])\ndef post_data():\n data = request.json\n # Simulate saving data to a database\n return jsonify({'message': 'Data saved successfully', 'data': data}), 201\n\n@app.route('/api/data/<int:data_id>', methods=['PUT'])\ndef update_data(data_id):\n data = request.json\n # Simulate updating data in a database\n return jsonify({'message': 'Data updated successfully', 'id': data_id, 'data': data}), 200\n\n@app.route('/api/data/<int:data_id>', methods=['DELETE'])\ndef delete_data(data_id):\n # Simulate deleting data by ID\n return jsonify({'message': 'Data deleted successfully', 'id': data_id}), 200\n\n@app.route('/redirect', methods=['GET'])\ndef example_redirect():\n return redirect(url_for('home'))\n\nif __name__ == '__main__':\n app.run(debug=True)",
"models.py": "from sqlalchemy import Column, Integer, String, ForeignKey\nfrom sqlalchemy.ext.declarative import declarative_base\nfrom sqlalchemy.orm import relationship\n\nBase = declarative_base()\n\nclass User(Base):\n __tablename__ = 'users'\n id = Column(Integer, primary_key=True)\n username = Column(String, unique=True, nullable=False)\n email = Column(String, unique=True, nullable=False)\n\n posts = relationship('Post', backref='author')\n\nclass Post(Base):\n __tablename__ = 'posts'\n id = Column(Integer, primary_key=True)\n title = Column(String, nullable=False)\n content = Column(String, nullable=False)\n user_id = Column(Integer, ForeignKey('users.id'))",
"big_class.py": "class BigClass:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n\n def get_name(self):\n return self.name\n\n def get_age(self):\n return self.age\n\n def set_name(self, name):\n self.name = name\n\n def set_age(self, age):\n self.age = age\n\n def __str__(self):\n return f'Name: {self.name}, Age: {self.age}'",
"main.py": "from flask import Flask\nfrom routes import app as routes_app\n\n# Create the Flask application\napp = Flask(__name__)\n\n# Register the routes from the routes.py file\napp.register_blueprint(routes_app)\n\n# Configuration settings for the app can go here\napp.config['DEBUG'] = True\napp.config['SECRET_KEY'] = 'your_secret_key'\n\n# More complex app initialization steps can be added here\n# For example, database initialization, login manager setups, etc.\n\n# This function can be used to create a database schema\n def create_database(app):\n if not path.exists('yourdatabase.db'):\n db.create_all(app=app)\n print('Created Database!')\n\nif __name__ == '__main__':\n # Optionally, call database creation or other setup functions here\n create_database(app)\n app.run()",
"utilities.py": "import hashlib\nimport uuid\nimport re\nfrom datetime import datetime, timedelta\n\n# Function to hash a password\ndef hash_password(password):\n salt = uuid.uuid4().hex\n return hashlib.sha256(salt.encode() + password.encode()).hexdigest() + ':' + salt\n\n# Function to check a hashed password\ndef check_password(hashed_password, user_password):\n password, salt = hashed_password.split(':')\n return password == hashlib.sha256(salt.encode() + user_password.encode()).hexdigest()\n\n# Function to validate an email address\ndef validate_email(email):\n pattern = r\"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$\"\n return re.match(pattern, email) is not None\n\n# Function to generate a token expiration date\ndef generate_expiration_date(days=1):\n return datetime.now() + timedelta(days=days)\n\n# Function to convert a string to datetime\ndef string_to_datetime(date_string, format='%Y-%m-%d %H:%M:%S'):\n return datetime.strptime(date_string, format)\n\n# Function to convert datetime to string\ndef datetime_to_string(date, format='%Y-%m-%d %H:%M:%S'):\n return date.strftime(format)",
"services.py": "import requests\nfrom flask_sqlalchemy import SQLAlchemy\n\n# Assuming an initialized Flask app with SQLAlchemy\ndb = SQLAlchemy()\n\n# Example of a model for database operations\nclass UserData(db.Model):\n id = db.Column(db.Integer, primary_key=True)\n name = db.Column(db.String(100), nullable=False)\n email = db.Column(db.String(100), unique=True, nullable=False)\n\n def __repr__(self):\n return f'<User {self.name}>'\n\n# Function to fetch data from an external API\ndef fetch_external_data(api_url):\n response = requests.get(api_url)\n if response.status_code == 200:\n return response.json()\n else:\n return {'error': 'Failed to fetch data'}\n\n# Function to save user data to the database\ndef save_user_data(name, email):\n new_user = UserData(name=name, email=email)\n db.session.add(new_user)\n try:\n db.session.commit()\n return {'message': 'User saved successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to update user data in the database\ndef update_user_data(user_id, name=None, email=None):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n if name:\n user.name = name\n if email:\n user.email = email\n try:\n db.session.commit()\n return {'message': 'User updated successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to delete user data from the database\ndef delete_user_data(user_id):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n try:\n db.session.delete(user)\n db.session.commit()\n return {'message': 'User deleted successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}",
"simple.js": "// This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n}\n\nmain();",
"text_only.js": "// This file is empty and should test the chunker's ability to handle empty files\n",
"routes.js": "const express = require('express');\nconst router = express.Router();\n\n// Example of a simple route\nrouter.get('/', (req, res) => {\n res.send('Welcome to the Home Page');\n});\n\n// Example of a route with a parameter\nrouter.get('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ id: dataId, value: 'Specific data based on ID' });\n});\n\n// Example of a route that handles POST requests\nrouter.post('/api/data', (req, res) => {\n const data = req.body;\n res.status(201).json({ message: 'Data saved successfully', data: data });\n});\n\n// Example of a route that handles PUT requests\nrouter.put('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n const data = req.body;\n res.json({ message: 'Data updated successfully', id: dataId, data: data });\n});\n\n// Example of a route that handles DELETE requests\nrouter.delete('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ message: 'Data deleted successfully', id: dataId });\n});\n\nmodule.exports = router;",
"models.js": "const mongoose = require('mongoose');\n\n// Example of a simple Mongoose model\nconst userSchema = new mongoose.Schema({\n username: { type: String, required: true, unique: true },\n email: { type: String, required: true, unique: true }\n});\n\nconst User = mongoose.model('User', userSchema);\n\nmodule.exports = User;",
"big_class.js": "// This is a sample JavaScript class with a large number of methods\n\nclass BigClass {\n constructor(name, age) {\n this.name = name;\n this.age = age;\n }\n\n getName() {\n return this.name;\n }\n\n getAge() {\n return this.age;\n }\n\n setName(name) {\n this.name = name;\n }\n\n setAge(age) {\n this.age = age;\n }\n\n toString() {\n return `Name: ${this.name}, Age: ${this.age}`;\n }\n}\n\nmodule.exports = BigClass;",
"main.js": "const express = require('express');\nconst routes = require('./routes');\n\n// Create the Express application\nconst app = express();\n\n// Register the routes from the routes.js file\napp.use('/', routes);\n\n// Configuration settings for the app can go here\napp.set('port', process.env.PORT || 3000);\n\n// More complex app initialization steps can be added here\n// For example, database initialization, middleware setups, etc.\n\n// This function can be used to create a database schema\nfunction createDatabase() {\n // Code to create database schema\n console.log('Created Database!');\n}\n\n// Optionally, call database creation or other setup functions here\ncreateDatabase();\n\n// Start the server\napp.listen(app.get('port'), () => {\n console.log(`Server running on port ${app.get('port')}`);\n});",
"utilities.js": "// Example of utility functions for common tasks\n\n// Function to hash a password\nfunction hashPassword(password) {\n const salt = uuidv4();\n return sha256(salt + password) + ':' + salt;\n}\n\n// Function to check a hashed password\nfunction checkPassword(hashedPassword, userPassword) {\n const [password, salt] = hashedPassword.split(':');\n return sha256(salt + userPassword) === password;\n}\n\n// Function to validate an email address\nfunction validateEmail(email) {\n const pattern = /^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$/;\n return pattern.test(email);\n}\n\n// Function to generate a token expiration date\nfunction generateExpirationDate(days = 1) {\n const expirationDate = new Date();\n expirationDate.setDate(expirationDate.getDate() + days);\n return expirationDate;\n}\n\n// Function to convert a string to a Date object\nfunction stringToDate(dateString, format = 'YYYY-MM-DD HH:mm:ss') {\n return new Date(dateString);\n}\n\n// Function to convert a Date object to a string\nfunction dateToString(date, format = 'YYYY-MM-DD HH:mm:ss') {\n return date.toISOString();\n}",
"services.js": "// Example of service functions for handling data operations\n\n// Function to fetch data from an external API\nasync function fetchExternalData(apiUrl) {\n try {\n const response = await fetch(apiUrl);\n if (response.ok) {\n return await response.json();\n } else {\n return { error: 'Failed to fetch data' };\n }\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to save user data to the database\nasync function saveUserData(name, email) {\n try {\n const newUser = new UserData({ name, email });\n await newUser.save();\n return { message: 'User saved successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to update user data in the database\nasync function updateUserData(userId, name, email) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n if (name) {\n user.name = name;\n }\n if (email) {\n user.email = email;\n }\n await user.save();\n return { message: 'User updated successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to delete user data from the database\nasync function deleteUserData(userId) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n await user.remove();\n return { message: 'User deleted successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}",
"react_component.js": "import React from 'react';\n\nimport './SearchResults.css';\n\nimport TrackList from '../TrackList/TrackList.js';\n\n constructor(props) {\n super(props);\n this.addTopFive = this.addTopFive.bind(this);\n this.addTopTen = this.addTopTen.bind(this);\n this.addAll = this.addAll.bind(this);\n }\n\n //add the top five tracks to the playlist\n addTopFive() {\n this.props.onAdd(this.props.searchResults.slice(0, 5));\n }\n\n //add top 10 tracks to the playlist\n addTopTen() {\n this.props.onAdd(this.props.searchResults.slice(0, 10));\n }\n\n addAll() {\n this.props.onAdd(this.props.searchResults);\n }\n render() {\n return (\n <div className=\"SearchResults\">\n <h2>Results</h2>\n <TrackList tracks={this.props.searchResults} onAdd={this.props.onAdd} onToggle={this.props.onToggle} currentTrack={this.props.currentTrack}/>\n </div>\n );\n }\n}\n\nexport default SearchResults;'",
"simple_styles.css": "/* Example of CSS styles for a web page */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}",
"media_queries.css": "/* Example of CSS styles with media queries for responsive design */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}",
"single_syntax_error_example.py": "# This is a sample Python file\n\nprint('Hello, world!'\n\n",
"multiple_syntax_errors.py": "def calculate_sum(lst):\n total = 0\n for num in lst\n total += num\n return total\n\nprint(calculate_sum([1, 2, 3, 4])\n\ndef string_manipulator(s):\n new_string = ''\n for char in s:\n if char == 'a':\n new_string += 'z'\n else:\n new_string += char\n return new_string\n\nprint(string_manipulate('banana'))\n\ndef find_max(numbers):\n max_num = numbers[0]\n for num in numbers\n if num > max_num\n max_num = num\n return max_num\n\nprint(find_max([1, 2, 3, 4, 5])",
"single_syntax_error_example.js": "//This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n if (true) {\n console.log('hi');\n \n}\n\nmain();",
"multiple_syntax_errors.js": "function calculateSum(arr) {\n let total = 0;\n for (let i = 0; i < arr.length; i++ {\n total += arr[i];\n }\n return total;\n}\n\nconsole.log(calculateSum([1, 2, 3, 4);\n\nfunction stringManipulator(str) {\n let newString = '';\n for (let i = 0; i < str.length; i++) {\n if (str.charAt(i) === 'a')\n newString += 'z';\n } else {\n newString += str.charAt(i);\n }\n }\n return newString;\n}\n\nconsole.log(stringManipulator('banana'));\n\nfunction findMax(numbers) {\n let maxNum = numbers[0];\n for (let i = 1; i < numbers.length; i++) {\n if (numbers[i] > maxNum) {\n maxNum = numbers[i];\n }\n }\n return maxNum;\n}\n\nconsole.log(findMax([1, 2, 3, 4, 5]);",
"single_syntax_error_example.css": "\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n",
"multiple_syntax_errors.css": "body {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}"
}
requirements.txt
ADDED
File without changes
test_code_chunker.py
ADDED
@@ -0,0 +1,214 @@
import unittest
from unittest.mock import patch
from Chunker import Chunker, CodeChunker
from utils import load_json
import tiktoken
import json
import os


# Mocking the count_tokens function as it's external and not the focus of these tests
def mock_count_tokens(string: str, encoding_name='gpt-4') -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens


# Python Test Class
class TestCodeChunkerPython(unittest.TestCase):
    def setUp(self):
        # Patch count_tokens where the Chunker module looks it up
        self.patcher = patch('Chunker.count_tokens', side_effect=mock_count_tokens)
        self.mock_count_tokens = self.patcher.start()
        self.code_chunker = CodeChunker(file_extension='py')
        self.mock_codebase = load_json('mock_codefiles.json')

    def tearDown(self):
        self.patcher.stop()

    def test_chunk_simple_code(self):
        py_code = self.mock_codebase['simple.py']
        first_chunk_token_limit = mock_count_tokens("import sys")
        print(f"first_chunk_token_limit = {first_chunk_token_limit}")
        chunks = self.code_chunker.chunk(py_code, token_limit=25)
        token_count = self.mock_count_tokens(py_code)
        print(f"token_count = {token_count}")
        print(f"original code:\n {py_code}")
        Chunker.print_chunks(chunks)
        full_code = Chunker.consolidate_chunks_into_file(chunks)
        print(f"code after consolidation:\n {full_code}")
        num_lines = Chunker.count_lines(full_code)
        self.assertEqual(num_lines, len(py_code.split("\n")))  # The number of lines should be the same
        self.assertIn(full_code, py_code)  # The consolidated code should be contained in the original code
        self.assertEqual(len(chunks), 2)  # There should be 2 chunks
        self.assertIn("import sys", chunks[1])  # The first chunk should contain the import statement
        self.assertIn("print('Hello, world!')", chunks[2])  # The second chunk should contain the print statement

    def test_chunk_code_text_only(self):
        py_code = self.mock_codebase['text_only.py']
        chunks = self.code_chunker.chunk(py_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(py_code.split("\n")))  # The number of lines should be the same
        self.assertIn(py_code, final_code)  # The original code should be contained in the consolidated output
        self.assertEqual(len(chunks), 1)
        self.assertIn("This file is empty and should test the chunker's ability to handle empty files", chunks[1])

    def test_chunk_code_with_routes(self):
        py_code = self.mock_codebase['routes.py']
        chunks = self.code_chunker.chunk(py_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(py_code.split("\n")))  # The number of lines should be the same
        self.assertIn(py_code, final_code)  # The original code should be contained in the consolidated output

    def test_chunk_code_with_models(self):
        py_code = self.mock_codebase['models.py']
        chunks = self.code_chunker.chunk(py_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(py_code.split("\n")))
        self.assertIn(py_code, final_code)

    def test_chunk_code_with_main(self):
        py_code = self.mock_codebase['main.py']
        chunks = self.code_chunker.chunk(py_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(py_code.split("\n")))
        self.assertIn(py_code, final_code)

    def test_chunk_code_with_utilities(self):
        py_code = self.mock_codebase['utilities.py']
        chunks = self.code_chunker.chunk(py_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(py_code.split("\n")))
        self.assertIn(py_code, final_code)

    def test_chunk_code_with_big_class(self):
        py_code = self.mock_codebase['big_class.py']
        chunks = self.code_chunker.chunk(py_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(py_code.split("\n")))
        self.assertIn(py_code, final_code)

# JavaScript Test Class
class TestCodeChunkerJavaScript(unittest.TestCase):

    def setUp(self):
        # Patch count_tokens where the Chunker module looks it up
        self.patcher = patch('Chunker.count_tokens', side_effect=mock_count_tokens)
        self.mock_count_tokens = self.patcher.start()
        self.code_chunker = CodeChunker(file_extension='js')
        self.mock_codebase = load_json('mock_codefiles.json')

    def tearDown(self):
        self.patcher.stop()

    def test_chunk_javascript_simple_code(self):
        js_code = self.mock_codebase['simple.js']
        chunks = self.code_chunker.chunk(js_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(js_code.split("\n")))
        self.assertIn(js_code, final_code)

    def test_chunk_javascript_with_routes(self):
        js_code = self.mock_codebase['routes.js']
        chunks = self.code_chunker.chunk(js_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(js_code.split("\n")))
        self.assertIn(js_code, final_code)

    def test_chunk_javascript_with_models(self):
        js_code = self.mock_codebase['models.js']
        chunks = self.code_chunker.chunk(js_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(js_code.split("\n")))
        self.assertIn(js_code, final_code)

    def test_chunk_javascript_with_main(self):
        js_code = self.mock_codebase['main.js']
        chunks = self.code_chunker.chunk(js_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(js_code.split("\n")))
        self.assertIn(js_code, final_code)

    def test_chunk_javascript_with_utilities(self):
        js_code = self.mock_codebase['utilities.js']
        chunks = self.code_chunker.chunk(js_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(js_code.split("\n")))
        self.assertIn(js_code, final_code)

    def test_chunk_javascript_with_big_class(self):
        js_code = self.mock_codebase['big_class.js']
        chunks = self.code_chunker.chunk(js_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(js_code.split("\n")))
        self.assertIn(js_code, final_code)

    def test_chunk_javascript_with_react_component(self):
        js_code = self.mock_codebase['react_component.js']
        chunks = self.code_chunker.chunk(js_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(js_code.split("\n")))
        self.assertIn(js_code, final_code)

# CSS Test Class
class TestCodeChunkerCSS(unittest.TestCase):

    def setUp(self):
        # Patch count_tokens where the Chunker module looks it up
        self.patcher = patch('Chunker.count_tokens', side_effect=mock_count_tokens)
        self.mock_count_tokens = self.patcher.start()
        self.code_chunker = CodeChunker(file_extension='css')
        # Load the JSON data
        self.mock_codebase = load_json('mock_codefiles.json')

    def tearDown(self):
        self.patcher.stop()

    def test_chunk_css_with_media_query(self):
        css_code = self.mock_codebase['media_queries.css']
        chunks = self.code_chunker.chunk(css_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(css_code.split("\n")))
        self.assertIn(css_code, final_code)

    def test_chunk_css_with_simple_css(self):
        css_code = self.mock_codebase['simple_styles.css']
        chunks = self.code_chunker.chunk(css_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
        self.assertEqual(num_lines, len(css_code.split("\n")))
        self.assertIn(css_code, final_code)

if __name__ == '__main__':
    unittest.main()

utils.py
ADDED
@@ -0,0 +1,14 @@
import tiktoken
import json


def count_tokens(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens


def load_json(json_file):
    with open(json_file) as f:
        return json.load(f)

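A quick check of the helpers above (hypothetical, not part of the commit); the encoding name is passed straight to tiktoken.encoding_for_model, so it should be a model name such as "gpt-4".

# Hypothetical usage sketch, not part of this commit.
from utils import count_tokens, load_json

files = load_json('mock_codefiles.json')          # the mock codebase added in this commit
print(count_tokens(files['simple.py'], 'gpt-4'))  # token count used by CodeChunker when splitting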