CintraAI committed on
Commit
a1983fb
·
1 Parent(s): 5b1b407

added codefiles

Browse files
Files changed (7) hide show
  1. .vscode/settings.json +11 -0
  2. Chunker.py +124 -0
  3. CodeParser.py +273 -0
  4. mock_codefiles.json +27 -0
  5. requirements.txt +0 -0
  6. test_code_chunker.py +214 -0
  7. utils.py +14 -0
.vscode/settings.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "python.testing.unittestArgs": [
3
+ "-v",
4
+ "-s",
5
+ ".",
6
+ "-p",
7
+ "test_*.py"
8
+ ],
9
+ "python.testing.pytestEnabled": false,
10
+ "python.testing.unittestEnabled": true
11
+ }
Chunker.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from CodeParser import CodeParser
3
+ from Utils import count_tokens
4
+
5
+
6
+
7
class Chunker(ABC):
    """Abstract interface for splitting content into numbered chunks.

    Subclasses implement ``chunk`` and ``get_chunk``; the static helpers
    operate on the ``{chunk_number: chunk_text}`` mapping form.
    """

    def __init__(self, encoding_name="gpt-4"):
        # Stored on the instance for subclasses to use when counting tokens.
        self.encoding_name = encoding_name

    @abstractmethod
    def chunk(self, content, token_limit):
        """Split ``content`` into chunks with respect to ``token_limit``."""
        pass

    @abstractmethod
    def get_chunk(self, chunked_content, chunk_number):
        """Return the chunk identified by ``chunk_number`` from ``chunked_content``."""
        pass

    @staticmethod
    def print_chunks(chunks):
        """Print every chunk to stdout, framed by 40-character ``=`` rules."""
        rule = "=" * 40
        for number, text in chunks.items():
            print(f"Chunk {number}:")
            print(rule)
            print(text)
            print(rule)

    @staticmethod
    def consolidate_chunks_into_file(chunks):
        """Join all chunk texts, in mapping order, with newline separators."""
        newline = "\n"
        return newline.join(chunks[key] for key in chunks)

    @staticmethod
    def count_lines(consolidated_chunks):
        """Return the number of newline-separated lines in the given text."""
        return len(consolidated_chunks.split("\n"))
35
+
36
class CodeChunker(Chunker):
    """Chunker that splits source code at syntactic boundaries.

    Boundaries ("breakpoints") are the 0-indexed start lines that
    ``CodeParser`` reports for points of interest (imports, classes,
    functions, ...). A breakpoint that is directly preceded by a run of
    comment/decorator lines is moved up to the first line of that run so the
    leading comments stay in the same chunk as the code they describe.
    """

    def __init__(self, file_extension, encoding_name="gpt-4"):
        super().__init__(encoding_name)
        self.file_extension = file_extension

    def chunk(self, code, token_limit) -> dict:
        """Split ``code`` into chunks of roughly ``token_limit`` tokens.

        Args:
            code: Full source text to split.
            token_limit: Token budget per chunk, measured line by line with
                ``count_tokens``. A chunk may exceed the budget when no
                breakpoint is available after the chunk's first line.

        Returns:
            Dict mapping 1-based chunk numbers to chunk text. Chunks that are
            empty or whitespace-only are not emitted.
        """
        code_parser = CodeParser(self.file_extension)
        chunks = {}
        token_count = 0
        lines = code.split("\n")
        i = 0
        chunk_number = 1
        start_line = 0

        raw_breakpoints = sorted(
            code_parser.get_lines_for_points_of_interest(code, self.file_extension)
        )
        # A set gives O(1) membership tests in the walk-back loop below.
        comment_lines = set(
            code_parser.get_lines_for_comments(code, self.file_extension)
        )

        adjusted_breakpoints = set()
        for bp in raw_breakpoints:
            current_line = bp - 1
            highest_comment_line = None  # None means no comment line found yet
            while current_line in comment_lines:
                highest_comment_line = current_line
                current_line -= 1

            # Compare against None explicitly: line 0 is a valid comment line
            # but is falsy, so a plain truthiness test would ignore a comment
            # block that starts on the first line of the file.
            if highest_comment_line is not None:
                adjusted_breakpoints.add(highest_comment_line)
            else:
                adjusted_breakpoints.add(bp)

        breakpoints = sorted(adjusted_breakpoints)  # unique and sorted

        while i < len(lines):
            new_token_count = count_tokens(lines[i], self.encoding_name)

            if token_count + new_token_count <= token_limit:
                # Still within budget: the line joins the current chunk.
                token_count += new_token_count
                i += 1
                continue

            # Over budget: cut at the current line if it is a breakpoint,
            # otherwise at the last breakpoint before it (never before the
            # start of the current chunk).
            if i in adjusted_breakpoints:
                stop_line = i
            else:
                stop_line = max(
                    max((x for x in breakpoints if x < i), default=start_line),
                    start_line,
                )

            if stop_line == start_line:
                # No usable breakpoint after the chunk start yet (or the
                # chunk's own first line already exceeds the budget): keep
                # accumulating until a later breakpoint allows a cut.
                token_count += new_token_count
                i += 1
            else:
                current_chunk = "\n".join(lines[start_line:stop_line])
                if current_chunk.strip():
                    chunks[chunk_number] = current_chunk
                    chunk_number += 1

                # Restart counting from the cut so lines between the cut and
                # the current position are re-measured for the next chunk.
                i = stop_line
                token_count = 0
                start_line = stop_line

        # Emit whatever is left after the last cut, unless it is blank.
        current_chunk_code = "\n".join(lines[start_line:])
        if current_chunk_code.strip():
            chunks[chunk_number] = current_chunk_code

        return chunks

    def get_chunk(self, chunked_codebase, chunk_number):
        """Return the chunk stored under ``chunk_number``.

        Raises:
            KeyError: If ``chunk_number`` is not a key of ``chunked_codebase``.
        """
        return chunked_codebase[chunk_number]
124
+
CodeParser.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from typing import List, Dict, Union, Tuple
4
+ from tree_sitter import Language, Parser, Node
5
+ from typing import Union, List
6
+ import logging
7
+
8
+
9
class CodeParser:
    """Tree-sitter based helper that finds structurally interesting lines.

    On construction the grammars for the requested file extensions are cloned
    into ``CACHE_DIR`` (if not already present), compiled, and loaded. The
    ``get_lines_for_*`` methods return 0-indexed line numbers taken from each
    node's ``start_point``.
    """

    # Directory holding cloned grammar repositories and compiled libraries.
    CACHE_DIR = os.path.expanduser("~/.code_parser_cache")

    def __init__(self, file_extensions: Union[None, List[str], str] = None):
        """Load parsers for the given extension(s).

        Args:
            file_extensions: A single extension (e.g. ``"py"``), a list of
                extensions, or ``None`` for no languages. Extensions missing
                from ``language_extension_map`` are silently ignored.
        """
        if isinstance(file_extensions, str):
            file_extensions = [file_extensions]
        self.language_extension_map = {
            "py": "python",
            "js": "javascript",
            "jsx": "javascript",
            "css": "css"
        }
        if file_extensions is None:
            self.language_names = []
        else:
            self.language_names = [
                self.language_extension_map.get(ext)
                for ext in file_extensions
                if ext in self.language_extension_map
            ]
        self.languages = {}
        self._install_parsers()

    def _install_parsers(self):
        """Clone (when missing), build, and load the grammar for each language.

        Raises:
            RuntimeError: If ``git clone`` cannot be started or exits non-zero.
        """
        # basicConfig only takes effect on its first call, so one call suffices.
        logging.basicConfig(level=logging.INFO)

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.CACHE_DIR, exist_ok=True)

        for language in self.language_names:
            repo_path = os.path.join(self.CACHE_DIR, f"tree-sitter-{language}")

            if not os.path.exists(repo_path):
                # Argument list with no shell: paths containing spaces or shell
                # metacharacters are passed to git verbatim.
                clone_args = [
                    "git",
                    "clone",
                    f"https://github.com/tree-sitter/tree-sitter-{language}",
                    repo_path,
                ]
                clone_command = " ".join(clone_args)  # for log output only
                try:
                    result = subprocess.run(
                        clone_args,
                        stdout=subprocess.PIPE,  # Capture standard output
                        stderr=subprocess.PIPE   # Capture standard error
                    )
                except OSError as err:
                    # e.g. the git executable is not installed / not on PATH
                    logging.error(
                        f"Failed to clone repository for {language}. Command: '{clone_command}'. Error: {err}")
                    raise RuntimeError(f"Failed to clone repository for {language}") from err

                if result.returncode != 0:
                    stderr_text = result.stderr.decode('utf-8', errors='replace')
                    logging.error(
                        f"Failed to clone repository for {language}. Command: '{clone_command}'. Error: {stderr_text}")
                    raise RuntimeError(f"Failed to clone repository for {language}")

            # NOTE(review): Language.build_library and Language(path, name) appear
            # to exist only in older py-tree-sitter releases - confirm the pinned
            # version (requirements.txt is empty in this commit).
            build_path = os.path.join(self.CACHE_DIR, f"build/{language}.so")
            Language.build_library(build_path, [repo_path])

            self.languages[language] = Language(build_path, language)

    def _require_language(self, file_extension: str):
        """Return the loaded language for ``file_extension`` or raise ValueError."""
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            raise ValueError("Unsupported file type")

        language = self.languages.get(language_name)
        if language is None:
            raise ValueError("Language parser not found")
        return language

    @staticmethod
    def _parse_with(language, code: str):
        """Parse ``code`` (encoded as UTF-8) with a fresh parser for ``language``."""
        parser = Parser()
        parser.set_language(language)
        return parser.parse(bytes(code, "utf8"))

    @staticmethod
    def _collect_tagged_nodes(node, node_types: Dict[str, str]):
        """Return ``(node, label)`` pairs, in pre-order, for nodes whose type is in ``node_types``."""
        found = []
        # Explicit stack instead of recursion, so deeply nested trees cannot
        # hit Python's recursion limit; reversed() keeps pre-order.
        pending = [node]
        while pending:
            current = pending.pop()
            if current.type in node_types:
                found.append((current, node_types[current.type]))
            pending.extend(reversed(current.children))
        return found

    @staticmethod
    def _flatten_start_lines(tagged_nodes) -> List[int]:
        """Group start lines by label (unique within a label), then concatenate the groups.

        The same line may appear more than once when nodes with different
        labels start on it.
        """
        lines_by_label: Dict[str, List[int]] = {}
        for node, label in tagged_nodes:
            start_line = node.start_point[0]
            bucket = lines_by_label.setdefault(label, [])
            if start_line not in bucket:
                bucket.append(start_line)

        return [line for bucket in lines_by_label.values() for line in bucket]

    def parse_code(self, code: str, file_extension: str) -> Union[None, Node]:
        """Parse ``code`` and return the tree's root node.

        Returns ``None`` (after printing a message) when the extension is
        unsupported, its parser was not loaded, or parsing yields no tree.
        """
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            print(f"Unsupported file type: {file_extension}")
            return None

        language = self.languages.get(language_name)
        if language is None:
            print("Language parser not found")
            return None

        tree = self._parse_with(language, code)

        if tree is None:
            print("Failed to parse the code")
            return None

        return tree.root_node

    def extract_points_of_interest(self, node: Node, file_extension: str) -> List[Tuple[Node, str]]:
        """Return ``(node, label)`` pairs for every point-of-interest node under ``node``.

        Raises:
            ValueError: If ``file_extension`` has no point-of-interest table.
        """
        # Look the table up once rather than once per visited node.
        node_types_of_interest = self._get_node_types_of_interest(file_extension)
        return self._collect_tagged_nodes(node, node_types_of_interest)

    def _get_node_types_of_interest(self, file_extension: str) -> Dict[str, str]:
        """Map node type to display label for points of interest (``jsx`` shares the ``js`` table)."""
        node_types = {
            'py': {
                'import_statement': 'Import',
                'export_statement': 'Export',
                'class_definition': 'Class',
                'function_definition': 'Function',
            },
            'css': {
                'tag_name': 'Tag',
                '@media': 'Media Query',
            },
            'js': {
                'import_statement': 'Import',
                'export_statement': 'Export',
                'class_declaration': 'Class',
                'function_declaration': 'Function',
                'arrow_function': 'Arrow Function',
                'statement_block': 'Block',
            }
        }

        if file_extension in node_types:
            return node_types[file_extension]
        elif file_extension == "jsx":
            return node_types["js"]
        else:
            raise ValueError("Unsupported file type")

    def _get_nodes_for_comments(self, file_extension: str) -> Dict[str, str]:
        """Map node type to display label for comment-like nodes (``jsx`` shares the ``js`` table)."""
        node_types = {
            'py': {
                'comment': 'Comment',
                'decorator': 'Decorator',  # Broadened category
            },
            'css': {
                'comment': 'Comment'
            },
            'js': {
                'comment': 'Comment',
                'decorator': 'Decorator',  # Broadened category
            }
        }

        if file_extension in node_types:
            return node_types[file_extension]
        elif file_extension == "jsx":
            return node_types["js"]
        else:
            raise ValueError("Unsupported file type")

    def extract_comments(self, node: Node, file_extension: str) -> List[Tuple[Node, str]]:
        """Return ``(node, label)`` pairs for every comment/decorator node under ``node``.

        Raises:
            ValueError: If ``file_extension`` has no comment table.
        """
        node_types_of_interest = self._get_nodes_for_comments(file_extension)
        return self._collect_tagged_nodes(node, node_types_of_interest)

    def get_lines_for_points_of_interest(self, code: str, file_extension: str) -> List[int]:
        """Return the 0-indexed start lines of all points of interest in ``code``.

        Lines are grouped by label in order of first appearance; the result is
        not globally sorted and may repeat a line across labels.

        Raises:
            ValueError: If the extension is unsupported or its parser is not loaded.
        """
        language = self._require_language(file_extension)
        tree = self._parse_with(language, code)
        points_of_interest = self.extract_points_of_interest(tree.root_node, file_extension)
        return self._flatten_start_lines(points_of_interest)

    def get_lines_for_comments(self, code: str, file_extension: str) -> List[int]:
        """Return the 0-indexed start lines of all comment/decorator nodes in ``code``.

        Lines are grouped by label in order of first appearance; the result is
        not globally sorted and may repeat a line across labels.

        Raises:
            ValueError: If the extension is unsupported or its parser is not loaded.
        """
        language = self._require_language(file_extension)
        tree = self._parse_with(language, code)
        comments = self.extract_comments(tree.root_node, file_extension)
        return self._flatten_start_lines(comments)

    def print_all_line_types(self, code: str, file_extension: str):
        """Print, for each line on which a node starts, the node types and the line's text."""
        language_name = self.language_extension_map.get(file_extension)
        if language_name is None:
            print(f"Unsupported file type: {file_extension}")
            return

        language = self.languages.get(language_name)
        if language is None:
            print("Language parser not found")
            return

        tree = self._parse_with(language, code)
        line_to_node_type = self.map_line_to_node_type(tree.root_node)

        code_lines = code.split('\n')

        for line_num, node_types in line_to_node_type.items():
            line_content = code_lines[line_num - 1]  # line_num is 1-indexed
            print(f"line {line_num}: {', '.join(node_types)} | Code: {line_content}")

    def map_line_to_node_type(self, node, line_to_node_type=None, depth=0):
        """Build a mapping of 1-indexed line number to types of nodes starting there.

        Args:
            node: Root of the (sub)tree to walk.
            line_to_node_type: Accumulator shared across the recursion; a new
                dict is created when omitted.
            depth: Current recursion depth; passed along but not otherwise used.

        Returns:
            The accumulator dict.
        """
        if line_to_node_type is None:
            line_to_node_type = {}

        start_line = node.start_point[0] + 1  # Tree-sitter lines are 0-indexed; converting to 1-indexed

        # Each node is recorded only on the line where it starts.
        line_to_node_type.setdefault(start_line, []).append(node.type)

        for child in node.children:
            self.map_line_to_node_type(child, line_to_node_type, depth + 1)

        return line_to_node_type

    def print_simple_line_numbers_with_code(self, code: str):
        """Print every line of ``code`` prefixed with its 1-indexed line number."""
        for line_number, line in enumerate(code.split('\n'), start=1):
            print(f"Line {line_number}: {line}")
273
+
mock_codefiles.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "simple.py": "import sys\n\n# This is a sample Python file\n\ndef main():\n print('Hello, world!')\n\nif __name__ == '__main__':\n main()",
3
+ "text_only.py": "# This file is empty and should test the chunker's ability to handle empty files\n",
4
+ "routes.py": "from flask import Flask, jsonify, request, redirect, url_for\napp = Flask(__name__)\n\n@app.route('/', methods=['GET'])\ndef home():\n return '<h1>Welcome to the Home Page</h1>', 200\n\n@authenticate # Hypothetical decorator for authentication\n@log_access # Hypothetical decorator for logging access\n@app.route('/api/data', methods=['GET'])\ndef get_data():\n # Simulate fetching data from a database or external service\n data = {'key': 'This is some data'}\n return jsonify(data), 200\n\n@app.route('/api/data/<int:data_id>', methods=['GET'])\ndef get_data_by_id(data_id):\n # Simulate fetching specific data by ID\n data = {'id': data_id, 'value': 'Specific data based on ID'}\n return jsonify(data), 200\n\n@app.route('/api/data', methods=['POST'])\ndef post_data():\n data = request.json\n # Simulate saving data to a database\n return jsonify({'message': 'Data saved successfully', 'data': data}), 201\n\n@app.route('/api/data/<int:data_id>', methods=['PUT'])\ndef update_data(data_id):\n data = request.json\n # Simulate updating data in a database\n return jsonify({'message': 'Data updated successfully', 'id': data_id, 'data': data}), 200\n\n@app.route('/api/data/<int:data_id>', methods=['DELETE'])\ndef delete_data(data_id):\n # Simulate deleting data by ID\n return jsonify({'message': 'Data deleted successfully', 'id': data_id}), 200\n\n@app.route('/redirect', methods=['GET'])\ndef example_redirect():\n return redirect(url_for('home'))\n\nif __name__ == '__main__':\n app.run(debug=True)",
5
+ "models.py": "from sqlalchemy import Column, Integer, String, ForeignKey\nfrom sqlalchemy.ext.declarative import declarative_base\nfrom sqlalchemy.orm import relationship\n\nBase = declarative_base()\n\nclass User(Base):\n __tablename__ = 'users'\n id = Column(Integer, primary_key=True)\n username = Column(String, unique=True, nullable=False)\n email = Column(String, unique=True, nullable=False)\n\n posts = relationship('Post', backref='author')\n\nclass Post(Base):\n __tablename__ = 'posts'\n id = Column(Integer, primary_key=True)\n title = Column(String, nullable=False)\n content = Column(String, nullable=False)\n user_id = Column(Integer, ForeignKey('users.id'))",
6
+ "big_class.py": "class BigClass:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n\n def get_name(self):\n return self.name\n\n def get_age(self):\n return self.age\n\n def set_name(self, name):\n self.name = name\n\n def set_age(self, age):\n self.age = age\n\n def __str__(self):\n return f'Name: {self.name}, Age: {self.age}'",
7
+ "main.py": "from flask import Flask\nfrom routes import app as routes_app\n\n# Create the Flask application\napp = Flask(__name__)\n\n# Register the routes from the routes.py file\napp.register_blueprint(routes_app)\n\n# Configuration settings for the app can go here\napp.config['DEBUG'] = True\napp.config['SECRET_KEY'] = 'your_secret_key'\n\n# More complex app initialization steps can be added here\n# For example, database initialization, login manager setups, etc.\n\n# This function can be used to create a database schema\n def create_database(app):\n if not path.exists('yourdatabase.db'):\n db.create_all(app=app)\n print('Created Database!')\n\nif __name__ == '__main__':\n # Optionally, call database creation or other setup functions here\n create_database(app)\n app.run()",
8
+ "utilities.py": "import hashlib\nimport uuid\nimport re\nfrom datetime import datetime, timedelta\n\n# Function to hash a password\ndef hash_password(password):\n salt = uuid.uuid4().hex\n return hashlib.sha256(salt.encode() + password.encode()).hexdigest() + ':' + salt\n\n# Function to check a hashed password\ndef check_password(hashed_password, user_password):\n password, salt = hashed_password.split(':')\n return password == hashlib.sha256(salt.encode() + user_password.encode()).hexdigest()\n\n# Function to validate an email address\ndef validate_email(email):\n pattern = r\"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$\"\n return re.match(pattern, email) is not None\n\n# Function to generate a token expiration date\ndef generate_expiration_date(days=1):\n return datetime.now() + timedelta(days=days)\n\n# Function to convert a string to datetime\ndef string_to_datetime(date_string, format='%Y-%m-%d %H:%M:%S'):\n return datetime.strptime(date_string, format)\n\n# Function to convert datetime to string\ndef datetime_to_string(date, format='%Y-%m-%d %H:%M:%S'):\n return date.strftime(format)",
9
+ "services.py": "import requests\nfrom flask_sqlalchemy import SQLAlchemy\n\n# Assuming an initialized Flask app with SQLAlchemy\ndb = SQLAlchemy()\n\n# Example of a model for database operations\nclass UserData(db.Model):\n id = db.Column(db.Integer, primary_key=True)\n name = db.Column(db.String(100), nullable=False)\n email = db.Column(db.String(100), unique=True, nullable=False)\n\n def __repr__(self):\n return f'<User {self.name}>'\n\n# Function to fetch data from an external API\ndef fetch_external_data(api_url):\n response = requests.get(api_url)\n if response.status_code == 200:\n return response.json()\n else:\n return {'error': 'Failed to fetch data'}\n\n# Function to save user data to the database\ndef save_user_data(name, email):\n new_user = UserData(name=name, email=email)\n db.session.add(new_user)\n try:\n db.session.commit()\n return {'message': 'User saved successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to update user data in the database\ndef update_user_data(user_id, name=None, email=None):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n if name:\n user.name = name\n if email:\n user.email = email\n try:\n db.session.commit()\n return {'message': 'User updated successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to delete user data from the database\ndef delete_user_data(user_id):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n try:\n db.session.delete(user)\n db.session.commit()\n return {'message': 'User deleted successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}",
10
+ "simple.js": "// This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n}\n\nmain();",
11
+ "text_only.js": "// This file is empty and should test the chunker's ability to handle empty files\n",
12
+ "routes.js": "const express = require('express');\nconst router = express.Router();\n\n// Example of a simple route\nrouter.get('/', (req, res) => {\n res.send('Welcome to the Home Page');\n});\n\n// Example of a route with a parameter\nrouter.get('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ id: dataId, value: 'Specific data based on ID' });\n});\n\n// Example of a route that handles POST requests\nrouter.post('/api/data', (req, res) => {\n const data = req.body;\n res.status(201).json({ message: 'Data saved successfully', data: data });\n});\n\n// Example of a route that handles PUT requests\nrouter.put('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n const data = req.body;\n res.json({ message: 'Data updated successfully', id: dataId, data: data });\n});\n\n// Example of a route that handles DELETE requests\nrouter.delete('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ message: 'Data deleted successfully', id: dataId });\n});\n\nmodule.exports = router;",
13
+ "models.js": "const mongoose = require('mongoose');\n\n// Example of a simple Mongoose model\nconst userSchema = new mongoose.Schema({\n username: { type: String, required: true, unique: true },\n email: { type: String, required: true, unique: true }\n});\n\nconst User = mongoose.model('User', userSchema);\n\nmodule.exports = User;",
14
+ "big_class.js": "// This is a sample JavaScript class with a large number of methods\n\nclass BigClass {\n constructor(name, age) {\n this.name = name;\n this.age = age;\n }\n\n getName() {\n return this.name;\n }\n\n getAge() {\n return this.age;\n }\n\n setName(name) {\n this.name = name;\n }\n\n setAge(age) {\n this.age = age;\n }\n\n toString() {\n return `Name: ${this.name}, Age: ${this.age}`;\n }\n}\n\nmodule.exports = BigClass;",
15
+ "main.js": "const express = require('express');\nconst routes = require('./routes');\n\n// Create the Express application\nconst app = express();\n\n// Register the routes from the routes.js file\napp.use('/', routes);\n\n// Configuration settings for the app can go here\napp.set('port', process.env.PORT || 3000);\n\n// More complex app initialization steps can be added here\n// For example, database initialization, middleware setups, etc.\n\n// This function can be used to create a database schema\nfunction createDatabase() {\n // Code to create database schema\n console.log('Created Database!');\n}\n\n// Optionally, call database creation or other setup functions here\ncreateDatabase();\n\n// Start the server\napp.listen(app.get('port'), () => {\n console.log(`Server running on port ${app.get('port')}`);\n});",
16
+ "utilities.js": "// Example of utility functions for common tasks\n\n// Function to hash a password\nfunction hashPassword(password) {\n const salt = uuidv4();\n return sha256(salt + password) + ':' + salt;\n}\n\n// Function to check a hashed password\nfunction checkPassword(hashedPassword, userPassword) {\n const [password, salt] = hashedPassword.split(':');\n return sha256(salt + userPassword) === password;\n}\n\n// Function to validate an email address\nfunction validateEmail(email) {\n const pattern = /^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$/;\n return pattern.test(email);\n}\n\n// Function to generate a token expiration date\nfunction generateExpirationDate(days = 1) {\n const expirationDate = new Date();\n expirationDate.setDate(expirationDate.getDate() + days);\n return expirationDate;\n}\n\n// Function to convert a string to a Date object\nfunction stringToDate(dateString, format = 'YYYY-MM-DD HH:mm:ss') {\n return new Date(dateString);\n}\n\n// Function to convert a Date object to a string\nfunction dateToString(date, format = 'YYYY-MM-DD HH:mm:ss') {\n return date.toISOString();\n}",
17
+ "services.js": "// Example of service functions for handling data operations\n\n// Function to fetch data from an external API\nasync function fetchExternalData(apiUrl) {\n try {\n const response = await fetch(apiUrl);\n if (response.ok) {\n return await response.json();\n } else {\n return { error: 'Failed to fetch data' };\n }\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to save user data to the database\nasync function saveUserData(name, email) {\n try {\n const newUser = new UserData({ name, email });\n await newUser.save();\n return { message: 'User saved successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to update user data in the database\nasync function updateUserData(userId, name, email) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n if (name) {\n user.name = name;\n }\n if (email) {\n user.email = email;\n }\n await user.save();\n return { message: 'User updated successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to delete user data from the database\nasync function deleteUserData(userId) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n await user.remove();\n return { message: 'User deleted successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}",
18
+ "react_component.js": "import React from 'react';\n\nimport './SearchResults.css';\n\nimport TrackList from '../TrackList/TrackList.js';\n\n constructor(props) {\n super(props);\n this.addTopFive = this.addTopFive.bind(this);\n this.addTopTen = this.addTopTen.bind(this);\n this.addAll = this.addAll.bind(this);\n }\n\n //add the top five tracks to the playlist\n addTopFive() {\n this.props.onAdd(this.props.searchResults.slice(0, 5));\n }\n\n //add top 10 tracks to the playlist\n addTopTen() {\n this.props.onAdd(this.props.searchResults.slice(0, 10));\n }\n\n addAll() {\n this.props.onAdd(this.props.searchResults);\n }\n render() {\n return (\n <div className=\"SearchResults\">\n <h2>Results</h2>\n <TrackList tracks={this.props.searchResults} onAdd={this.props.onAdd} onToggle={this.props.onToggle} currentTrack={this.props.currentTrack}/>\n </div>\n );\n }\n}\n\nexport default SearchResults;'",
19
+ "simple_styles.css": "/* Example of CSS styles for a web page */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}",
20
+ "media_queries.css": "/* Example of CSS styles with media queries for responsive design */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}",
21
+ "single_syntax_error_example.py": "# This is a sample Python file\n\nprint('Hello, world!'\n\n",
22
+ "multiple_syntax_errors.py": "def calculate_sum(lst):\n total = 0\n for num in lst\n total += num\n return total\n\nprint(calculate_sum([1, 2, 3, 4])\n\ndef string_manipulator(s):\n new_string = ''\n for char in s:\n if char == 'a':\n new_string += 'z'\n else:\n new_string += char\n return new_string\n\nprint(string_manipulate('banana'))\n\ndef find_max(numbers):\n max_num = numbers[0]\n for num in numbers\n if num > max_num\n max_num = num\n return max_num\n\nprint(find_max([1, 2, 3, 4, 5])",
23
+ "single_syntax_error_example.js": "//This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n if (true) {\n console.log('hi');\n \n}\n\nmain();",
24
+ "multiple_syntax_errors.js": "function calculateSum(arr) {\n let total = 0;\n for (let i = 0; i < arr.length; i++ {\n total += arr[i];\n }\n return total;\n}\n\nconsole.log(calculateSum([1, 2, 3, 4);\n\nfunction stringManipulator(str) {\n let newString = '';\n for (let i = 0; i < str.length; i++) {\n if (str.charAt(i) === 'a')\n newString += 'z';\n } else {\n newString += str.charAt(i);\n }\n }\n return newString;\n}\n\nconsole.log(stringManipulator('banana'));\n\nfunction findMax(numbers) {\n let maxNum = numbers[0];\n for (let i = 1; i < numbers.length; i++) {\n if (numbers[i] > maxNum) {\n maxNum = numbers[i];\n }\n }\n return maxNum;\n}\n\nconsole.log(findMax([1, 2, 3, 4, 5]);",
25
+ "single_syntax_error_example.css": "\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n",
26
+ "multiple_syntax_errors.css": "body {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}"
27
+ }
requirements.txt ADDED
File without changes
test_code_chunker.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from unittest.mock import patch
3
+ from Chunker import Chunker, CodeChunker
4
+ from utils import load_json
5
+ import tiktoken
6
+ import json
7
+ import os
8
+
9
+
10
+
11
+ # Mocking the count_tokens function as it's external and not the focus of these tests
12
def mock_count_tokens(string: str, encoding_name='gpt-4') -> int:
    """Return how many tokens tiktoken produces for ``string``.

    Used as the ``side_effect`` of the patched ``count_tokens`` in the test
    classes below.

    Args:
        string: Text to tokenize.
        encoding_name: Passed to ``tiktoken.encoding_for_model``, so it is
            interpreted as a model name; defaults to ``'gpt-4'``.

    Returns:
        Length of the token sequence produced for ``string``.
    """
    return len(tiktoken.encoding_for_model(encoding_name).encode(string))
17
+
18
+ # Python Test Class
19
class TestCodeChunkerPython(unittest.TestCase):
    """Round-trip tests for ``CodeChunker`` on the Python fixtures.

    Every test chunks one entry of ``mock_codefiles.json`` and checks that
    joining the chunks back together preserves the original text.
    """

    def setUp(self):
        # Patch the name on the module that the tests import CodeChunker from
        # (``from Chunker import ...``); a dotted path into a package that is
        # not importable makes patcher.start() raise ModuleNotFoundError.
        self.patcher = patch('Chunker.count_tokens', side_effect=mock_count_tokens)
        self.mock_count_tokens = self.patcher.start()
        # Registered immediately so the patch is undone even when a later
        # setUp step raises (tearDown is skipped in that case).
        self.addCleanup(self.patcher.stop)
        self.code_chunker = CodeChunker(file_extension='py')
        self.mock_codebase = load_json('mock_codefiles.json')

    def _assert_round_trip(self, fixture_name, token_limit=20):
        """Chunk a fixture and assert the re-joined chunks preserve it.

        Args:
            fixture_name: Key of the fixture in ``self.mock_codebase``.
            token_limit: Token budget handed to ``CodeChunker.chunk``.

        Returns:
            The chunks produced by the chunker, for extra assertions.
        """
        py_code = self.mock_codebase[fixture_name]
        chunks = self.code_chunker.chunk(py_code, token_limit=token_limit)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        # Re-joining the chunks must not add or drop lines.
        self.assertEqual(Chunker.count_lines(final_code), len(py_code.split("\n")))
        # The original text must appear verbatim in the consolidated output.
        self.assertIn(py_code, final_code)
        return chunks

    def test_chunk_simple_code(self):
        py_code = self.mock_codebase['simple.py']
        first_chunk_token_limit = mock_count_tokens("import sys")
        print(f"first_chunk_token_limit = {first_chunk_token_limit}")
        chunks = self.code_chunker.chunk(py_code, token_limit=25)
        token_count = self.mock_count_tokens(py_code)
        print(f"token_count = {token_count}")
        print(f"original code:\n {py_code}")
        Chunker.print_chunks(chunks)
        full_code = Chunker.consolidate_chunks_into_file(chunks)
        print(f"code after consolidation:\n {full_code}")
        num_lines = Chunker.count_lines(full_code)
        # Re-joining the chunks must not add or drop lines.
        self.assertEqual(num_lines, len(py_code.split("\n")))
        # The consolidated output must be contained in the original code.
        self.assertIn(full_code, py_code)
        self.assertEqual(len(chunks), 2)  # There should be 2 chunks
        # Chunks are keyed by 1-based chunk number.
        self.assertIn("import sys", chunks[1])  # First chunk holds the import
        self.assertIn("print('Hello, world!')", chunks[2])  # Second chunk holds the print

    def test_chunk_code_text_only(self):
        chunks = self._assert_round_trip('text_only.py')
        self.assertEqual(len(chunks), 1)
        self.assertIn("This file is empty and should test the chunker's ability to handle empty files", chunks[1])

    def test_chunk_code_with_routes(self):
        self._assert_round_trip('routes.py')

    def test_chunk_code_with_models(self):
        self._assert_round_trip('models.py')

    def test_chunk_code_with_main(self):
        self._assert_round_trip('main.py')

    def test_chunk_code_with_utilities(self):
        self._assert_round_trip('utilities.py')

    def test_chunk_code_with_big_class(self):
        self._assert_round_trip('big_class.py')
104
+
105
+ # JavaScript Test Class
106
class TestCodeChunkerJavaScript(unittest.TestCase):
    """Round-trip tests for ``CodeChunker`` on the JavaScript fixtures.

    Every test chunks one entry of ``mock_codefiles.json`` and checks that
    joining the chunks back together preserves the original text.
    """

    def setUp(self):
        # Patch the name on the module that the tests import CodeChunker from
        # (``from Chunker import ...``); a dotted path into a package that is
        # not importable makes patcher.start() raise ModuleNotFoundError.
        self.patcher = patch('Chunker.count_tokens', side_effect=mock_count_tokens)
        self.mock_count_tokens = self.patcher.start()
        # Registered immediately so the patch is undone even when a later
        # setUp step raises (tearDown is skipped in that case).
        self.addCleanup(self.patcher.stop)
        self.code_chunker = CodeChunker(file_extension='js')
        self.mock_codebase = load_json('mock_codefiles.json')

    def _assert_round_trip(self, fixture_name, token_limit=20):
        """Chunk a fixture and assert the re-joined chunks preserve it.

        Args:
            fixture_name: Key of the fixture in ``self.mock_codebase``.
            token_limit: Token budget handed to ``CodeChunker.chunk``.

        Returns:
            The chunks produced by the chunker, for extra assertions.
        """
        js_code = self.mock_codebase[fixture_name]
        chunks = self.code_chunker.chunk(js_code, token_limit=token_limit)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        # Re-joining the chunks must not add or drop lines.
        self.assertEqual(Chunker.count_lines(final_code), len(js_code.split("\n")))
        # The original text must appear verbatim in the consolidated output.
        self.assertIn(js_code, final_code)
        return chunks

    def test_chunk_javascript_simple_code(self):
        self._assert_round_trip('simple.js')

    def test_chunk_javascript_with_routes(self):
        self._assert_round_trip('routes.js')

    def test_chunk_javascript_with_models(self):
        self._assert_round_trip('models.js')

    def test_chunk_javascript_with_main(self):
        self._assert_round_trip('main.js')

    def test_chunk_javascript_with_utilities(self):
        self._assert_round_trip('utilities.js')

    def test_chunk_javascript_with_big_class(self):
        self._assert_round_trip('big_class.js')

    def test_chunk_javascript_with_react_component(self):
        self._assert_round_trip('react_component.js')
181
+
182
+ # CSS Test Class
183
class TestCodeChunkerCSS(unittest.TestCase):
    """Round-trip tests for ``CodeChunker`` on the CSS fixtures.

    Every test chunks one entry of ``mock_codefiles.json`` and checks that
    joining the chunks back together preserves the original text.
    """

    def setUp(self):
        # Patch the name on the module that the tests import CodeChunker from
        # (``from Chunker import ...``); a dotted path into a package that is
        # not importable makes patcher.start() raise ModuleNotFoundError.
        self.patcher = patch('Chunker.count_tokens', side_effect=mock_count_tokens)
        self.mock_count_tokens = self.patcher.start()
        # Registered immediately so the patch is undone even when a later
        # setUp step raises (tearDown is skipped in that case).
        self.addCleanup(self.patcher.stop)
        self.code_chunker = CodeChunker(file_extension='css')
        # Load the JSON fixture data
        self.mock_codebase = load_json('mock_codefiles.json')

    def _assert_round_trip(self, fixture_name, token_limit=20):
        """Chunk a fixture and assert the re-joined chunks preserve it.

        Args:
            fixture_name: Key of the fixture in ``self.mock_codebase``.
            token_limit: Token budget handed to ``CodeChunker.chunk``.

        Returns:
            The chunks produced by the chunker, for extra assertions.
        """
        css_code = self.mock_codebase[fixture_name]
        chunks = self.code_chunker.chunk(css_code, token_limit=token_limit)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        # Re-joining the chunks must not add or drop lines.
        self.assertEqual(Chunker.count_lines(final_code), len(css_code.split("\n")))
        # The original text must appear verbatim in the consolidated output.
        self.assertIn(css_code, final_code)
        return chunks

    def test_chunk_css_with_media_query(self):
        self._assert_round_trip('media_queries.css')

    def test_chunk_css_with_simple_css(self):
        self._assert_round_trip('simple_styles.css')
212
+
213
+ if __name__ == '__main__':
214
+ unittest.main()
utils.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ import json
3
+
4
def count_tokens(string: str, encoding_name: str) -> int:
    """Count the tokens tiktoken produces for ``string``.

    Args:
        string: Text to tokenize.
        encoding_name: Passed to ``tiktoken.encoding_for_model``, so despite
            its name it is interpreted as a model name (e.g. ``"gpt-4"``).

    Returns:
        Length of the token sequence produced for ``string``.
    """
    tokens = tiktoken.encoding_for_model(encoding_name).encode(string)
    return len(tokens)
9
+
10
+
11
+
12
def load_json(json_file):
    """Parse the JSON document stored at ``json_file``.

    Args:
        json_file: Path of the file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(json_file, encoding="utf-8") as f:
        return json.load(f)