Add code chunking functionality

- .github/workflows/main.yml +20 -0
- .idea/.gitignore +8 -0
- .idea/code-chunker.iml +15 -0
- .idea/misc.xml +9 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- Chunker.py +14 -14
- LICENSE +21 -201
- README.md +30 -9
- __pycache__/Chunker.cpython-312.pyc +0 -0
- __pycache__/test_code_chunker.cpython-312.pyc +0 -0
- __pycache__/utils.cpython-312.pyc +0 -0
- app.py +88 -0
- cintra/documentation.json +7 -0
- mock_codefiles.json +29 -26
- requirements.txt +24 -46
- test_code_chunker.py +70 -0
- utils.py +0 -2
.github/workflows/main.yml
ADDED
@@ -0,0 +1,20 @@
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push https://jsham042:$HF_TOKEN@huggingface.co/spaces/CintraAI/code-chunker main
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
.idea/code-chunker.iml
ADDED
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PackageRequirementsSettings">
    <option name="removeUnused" value="true" />
    <option name="modifyBaseFiles" value="true" />
  </component>
</module>
.idea/misc.xml
ADDED
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.12 (code-chunker)" />
  </component>
  <component name="ProjectRootManager" version="2" languageLevel="JDK_21" project-jdk-name="Python 3.12 (code-chunker)" project-jdk-type="Python SDK">
    <output url="file://$PROJECT_DIR$/out" />
  </component>
</project>
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/code-chunker.iml" filepath="$PROJECT_DIR$/.idea/code-chunker.iml" />
    </modules>
  </component>
</project>
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>
Chunker.py
CHANGED
@@ -3,7 +3,6 @@ from CodeParser import CodeParser
 from utils import count_tokens
 
 
-
 class Chunker(ABC):
     def __init__(self, encoding_name="gpt-4"):
         self.encoding_name = encoding_name
@@ -20,19 +19,20 @@
     def print_chunks(chunks):
         for chunk_number, chunk_code in chunks.items():
             print(f"Chunk {chunk_number}:")
-            print("="*40)
+            print("=" * 40)
             print(chunk_code)
-            print("="*40)
+            print("=" * 40)
 
     @staticmethod
     def consolidate_chunks_into_file(chunks):
         return "\n".join(chunks.values())
-
+
     @staticmethod
     def count_lines(consolidated_chunks):
         lines = consolidated_chunks.split("\n")
         return len(lines)
 
+
 class CodeChunker(Chunker):
     def __init__(self, file_extension, encoding_name="gpt-4"):
         super().__init__(encoding_name)
@@ -60,15 +60,16 @@
            if highest_comment_line: # If a highest comment line exists, add it
                adjusted_breakpoints.append(highest_comment_line)
            else:
-               adjusted_breakpoints.append(bp) # If no comments were found before the breakpoint, add the original breakpoint
+               adjusted_breakpoints.append(
+                   bp)  # If no comments were found before the breakpoint, add the original breakpoint
 
        breakpoints = sorted(set(adjusted_breakpoints)) # Ensure breakpoints are unique and sorted
-
+
        while i < len(lines):
            line = lines[i]
            new_token_count = count_tokens(line, self.encoding_name)
            if token_count + new_token_count > token_limit:
-
+
                # Set the stop line to the last breakpoint before the current line
                if i in breakpoints:
                    stop_line = i
@@ -79,20 +80,20 @@
                if stop_line == start_line and i not in breakpoints:
                    token_count += new_token_count
                    i += 1
-
+
                # If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
                elif stop_line == start_line and i == stop_line:
                    token_count += new_token_count
                    i += 1
-
-
+
+
                # If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
                elif stop_line == start_line and i in breakpoints:
                    current_chunk = "\n".join(lines[start_line:stop_line])
                    if current_chunk.strip(): # If the current chunk is not just whitespace
                        chunks[chunk_number] = current_chunk # Using chunk_number as key
                        chunk_number += 1
-
+
                    token_count = 0
                    start_line = i
                    i += 1
@@ -103,7 +104,7 @@
                if current_chunk.strip():
                    chunks[chunk_number] = current_chunk # Using chunk_number as key
                    chunk_number += 1
-
+
                i = stop_line
                token_count = 0
                start_line = stop_line
@@ -116,9 +117,8 @@
        current_chunk_code = "\n".join(lines[start_line:])
        if current_chunk_code.strip(): # Checks if the chunk is not just whitespace
            chunks[chunk_number] = current_chunk_code # Using chunk_number as key
-
+
        return chunks
 
    def get_chunk(self, chunked_codebase, chunk_number):
        return chunked_codebase[chunk_number]
-
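The hunks above are mostly whitespace and line-wrapping cleanups around `CodeChunker.chunk`, whose core idea is: walk the file line by line, accumulate a token count, and once the count passes the target, cut the chunk at the most recent breakpoint (a function, class, or comment line found by CodeParser). As a rough illustration of that idea only, not the project's actual implementation, here is a self-contained sketch that uses a whitespace token counter in place of the tiktoken-based `utils.count_tokens`:

```py
from typing import Dict, List

def count_tokens_stub(line: str) -> int:
    # Stand-in for utils.count_tokens (which uses tiktoken); whitespace split only.
    return len(line.split())

def chunk_by_breakpoints(code: str, breakpoints: List[int], token_limit: int) -> Dict[int, str]:
    """Greedy chunking: fill a chunk until token_limit, then cut at the last seen breakpoint."""
    lines = code.split("\n")
    chunks: Dict[int, str] = {}
    chunk_number = 1
    start = 0
    token_count = 0
    last_break = 0
    for i, line in enumerate(lines):
        if i in breakpoints:
            last_break = i  # remember the latest place where a chunk may legally start
        token_count += count_tokens_stub(line)
        if token_count > token_limit and last_break > start:
            chunks[chunk_number] = "\n".join(lines[start:last_break])
            chunk_number += 1
            start = last_break
            # re-count the lines already consumed since the new start
            token_count = sum(count_tokens_stub(l) for l in lines[start:i + 1])
    remainder = "\n".join(lines[start:])
    if remainder.strip():
        chunks[chunk_number] = remainder
    return chunks
```

As in `CodeChunker.chunk`, the result is a dict mapping chunk numbers to code strings, so it has the same shape that `print_chunks` and `consolidate_chunks_into_file` expect.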
LICENSE
CHANGED
@@ -1,201 +1,21 @@
-[Apache License, Version 2.0 text (201 lines) removed]
+MIT License
+
+Copyright (c) 2024 CINTRAAI Code Chunker
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,3 +1,16 @@
+---
+title: CintraAI Code Chunker
+emoji: 🧩
+colorFrom: blue
+colorTo: indigo
+sdk: streamlit
+sdk_version: 1.33.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+
 # CintraAI Code Chunker
 
 Cintra's Code Chunker is a novel open-source tool designed to enhance code readability and maintainability by intelligently chunking code files based on key points of interest. This tool leverages advanced parsing techniques to identify significant elements in your code, such as functions, classes, and comments, to organize your codebase into manageable, easily understandable chunks. It's an invaluable resource for applications such as RAG, code patching, and other use cases.
@@ -7,24 +20,29 @@ Cintra's Code Chunker is a novel open-source tool designed to enhance code reada
 - **Intelligent Chunking:** Break down your code files into chunks around key points of interest like function definitions, class declarations, and crucial comments.
 - **Customizable Token Limits:** Control the size of each chunk with customizable token limits, ensuring that chunks remain manageable and focused.
 - **Support for Multiple Languages:** Initially supporting Python, JavaScript, and CSS, with plans to expand to more programming languages.
+
+## Try Out Code Chunker!
+
+Interested in seeing how it works? Check out our interactive demo on **Hugging Face Spaces**.
+
+[**Click here to try it out!**](https://huggingface.co/spaces/CintraAI/code-chunker)
 
 ## Getting Started
 
 ### Prerequisites
 
 - Python 3.8+
-- OpenAI API key (for token counting features)
 
 ### Installation
 
 1. Clone the repository:
 ```sh
-git clone https://github.com/yourgithubusername/code-chunker
+git clone https://github.com/yourgithubusername/code-chunker.git
 ```
 
 2. Navigate to the project directory
 ```sh
-
+cd code-chunker
 ```
 4. Install the required dependencies
 ```sh
@@ -35,8 +53,6 @@ pip install -r requirements.txt
 Use the CodeChunker class to chunk a specific code file. You can specify the file extension and token limit for chunking.
 Example:
 ```py
-from backend.app.util.TextChunker.Chunker import CodeChunker
-
 chunker = CodeChunker(file_extension='py', encoding_name='gpt-4')
 chunks = chunker.chunk(your_code_here, token_limit=1000)
 CodeChunker.print_chunks(chunks)
@@ -46,21 +62,26 @@ CodeChunker.print_chunks(chunks)
 The CodeParser class allows you to parse code to identify points of interest and comments, which can then be used for chunking or other analysis.
 Example:
 ```
-from backend.app.util.CodeParsing.CodeParser import CodeParser
-
 parser = CodeParser(['py'])
 tree = parser.parse_code(your_code_here, 'py')
 points_of_interest = parser.extract_points_of_interest(tree, 'py')
 ```
 
+3. Understanding the Token Limit in Chunking:
+
+In the `chunk` method of the `Chunker` class, a `token_limit` parameter is used to control the size of each chunk of code. A 'token' can be thought of as the smallest unit of processing. In the context of text processing, a token could be a word, a sentence, or a similar unit.
+
+The `token_limit` parameter limits the number of these tokens for each chunk. If the limit is, for instance, 100 tokens, that means each chunk of content produced by the `chunk` method should contain no more than 100 tokens.
+
+It is worth noting that the way content is tokenized and how a token is defined depends on the specific implementation and the type of content being processed.
+
 ## Contributing
 We welcome contributions from the community, whether it's through reporting bugs, submitting feature requests, or sending pull requests. Please check the CONTRIBUTING.md file for more details on how to contribute to the project.
 
 ## License
-This project is licensed under the
+This project is licensed under the MIT license. See the License file for details
 
 ## Acknowledgments
 - This project utilizes the tree-sitter project for parsing code.
 - This also uses tiktoken to count tokens for determining chunk sizes.
 
-
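Tying the README's usage snippet and its token-limit explanation together, a minimal end-to-end run might look like the sketch below. It assumes the repository's requirements (notably tiktoken and the tree-sitter grammars) are installed and that it is run from the repository root; the sample source and the 40-token target are illustrative only.

```py
from Chunker import CodeChunker

source = """
def greet(name):
    return f"Hello, {name}!"

def farewell(name):
    return f"Goodbye, {name}!"
"""

# token_limit is a target, not a hard cap: chunks end on logical breakpoints.
chunker = CodeChunker(file_extension='py', encoding_name='gpt-4')
chunks = chunker.chunk(source, token_limit=40)

CodeChunker.print_chunks(chunks)                              # pretty-print each chunk
restored = CodeChunker.consolidate_chunks_into_file(chunks)   # rejoin the chunks into one string
print(CodeChunker.count_lines(restored))                      # line count of the rejoined code
```

Because the limit is a target rather than a hard cap, the two functions here may land in one chunk or two depending on how many tokens the gpt-4 encoding assigns them.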
__pycache__/Chunker.cpython-312.pyc
CHANGED
Binary files a/__pycache__/Chunker.cpython-312.pyc and b/__pycache__/Chunker.cpython-312.pyc differ
__pycache__/test_code_chunker.cpython-312.pyc
CHANGED
Binary files a/__pycache__/test_code_chunker.cpython-312.pyc and b/__pycache__/test_code_chunker.cpython-312.pyc differ
__pycache__/utils.cpython-312.pyc
CHANGED
Binary files a/__pycache__/utils.cpython-312.pyc and b/__pycache__/utils.cpython-312.pyc differ
app.py
ADDED
@@ -0,0 +1,88 @@
import streamlit as st
import json
import os
from Chunker import CodeChunker

# Set Streamlit page config at the very beginning
st.set_page_config(page_title="Cintra Code Chunker", layout="wide")

# Function to load JSON data
def load_json_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

# Function to read code from an uploaded file
def read_code_from_file(uploaded_file):
    return uploaded_file.getvalue().decode("utf-8")

st.link_button('Contribute on GitHub', 'https://github.com/CintraAI/code-chunker', help=None, type="secondary", disabled=False, use_container_width=False)

json_file_path = os.path.join(os.path.dirname(__file__), 'mock_codefiles.json')
code_files_data = load_json_file(json_file_path)

# Extract filenames and contents
code_files = list(code_files_data.keys())

st.title('Cintra Code Chunker')

selection_col, upload_col = st.columns(2)
with selection_col:
    # File selection dropdown
    selected_file_name = st.selectbox("Select an example code file", code_files)

with upload_col:
    # File upload
    uploaded_file = st.file_uploader("Or upload your code file", type=['py', 'js', 'css', 'jsx'])

# Determine the content and file extension based on selection or upload
if uploaded_file is not None:
    code_content = read_code_from_file(uploaded_file)
    file_extension = uploaded_file.name.split('.')[-1]
else:
    code_content = code_files_data.get(selected_file_name, "")
    file_extension = selected_file_name.split('.')[-1] if selected_file_name else None

# Determine the language for syntax highlighting
def get_language_by_extension(file_extension):
    if file_extension in ['py', 'python']:
        return 'python'
    elif file_extension in ['js', 'jsx', 'javascript']:
        return 'javascript'
    elif file_extension == 'css':
        return 'css'
    else:
        return None

language = get_language_by_extension(file_extension)

st.write("""
### Choose Chunk Size Target""")
token_chunk_size = st.number_input('Target Chunk Size Target', min_value=5, max_value=1000, value=25, help="The token limit guides the chunk size in tokens (tiktoken, gpt-4), aiming for readability without enforcing a strict upper limit.")

with st.expander("Learn more about the chunk size target"):
    st.markdown("""
    The `token_limit` parameter in the `chunk` function serves as a guideline to optimize the size of code chunks produced. It is not a hard limit but rather an ideal target, attempting to achieve a balance between chunk size and maintaining logical coherence within the code.

    - **Adherence to Logical Breakpoints:** The chunking logic respects logical breakpoints in the code, ensuring that chunks are coherent and maintain readability.
    - **Flexibility in Chunk Size:** Chunks might be slightly smaller or larger than the specified `token_limit` to avoid breaking the code in the middle of logical sections.
    - **Handling Final Chunks:** The last chunk of code captures any remaining code, which may vary significantly in size depending on the remaining code's structure.

    This approach allows for flexibility in how code is segmented into chunks, emphasizing the balance between readable, logical code segments and size constraints.
    """)

original_col, chunked_col = st.columns(2)

with original_col:
    st.subheader('Original File')
    st.code(code_content, language=language)

# Initialize the code chunker
code_chunker = CodeChunker(file_extension=file_extension)

# Chunk the code content
chunked_code_dict = code_chunker.chunk(code_content, token_chunk_size)

with chunked_col:
    st.subheader('Chunked Code')
    for chunk_key, chunk_code in chunked_code_dict.items():
        st.code(chunk_code, language=language)
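The Streamlit app added above boils down to two steps: pick a code sample (from mock_codefiles.json or an upload) and pass it to CodeChunker. A headless sketch of that same flow, useful for checking the chunker without the UI, could look like this; the choice of 'routes.py' and the 25-token target mirror the app's defaults and are otherwise arbitrary:

```py
import json
import os

from Chunker import CodeChunker

# Load the bundled example files, exactly as app.py does.
json_file_path = os.path.join(os.path.dirname(__file__), 'mock_codefiles.json')
with open(json_file_path, 'r') as f:
    code_files_data = json.load(f)

name = 'routes.py'
code_content = code_files_data[name]
file_extension = name.split('.')[-1]

# Same call the app makes when rendering the "Chunked Code" column.
chunker = CodeChunker(file_extension=file_extension)
chunks = chunker.chunk(code_content, 25)  # 25 is the app's default target chunk size

for key, chunk in chunks.items():
    print(f"--- chunk {key} ---")
    print(chunk)
```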
cintra/documentation.json
ADDED
@@ -0,0 +1,7 @@
{
"Chunker.py": "Description: This file provides functionality for chunking code into manageable segments based on token limits and breakpoints.\n\nDependencies: CodeParser, Utils\n\nFunctions:\n- Name: chunk\n Description: Divides code into chunks based on token limits and breakpoints.\n Parameters: content (str) - The code content to be chunked, token_limit (int) - The maximum number of tokens allowed in a chunk.\n Returns: dict - A dictionary containing the chunked code segments.\n\n- Name: get_chunk\n Description: Retrieves a specific chunk from the chunked codebase.\n Parameters: chunked_content (dict) - The dictionary of chunked code segments, chunk_number (int) - The number of the desired chunk.\n Returns: str - The code segment corresponding to the specified chunk number.\n\n- Name: print_chunks\n Description: Prints out the chunked code segments with a header for each chunk.\n Parameters: chunks (dict) - The dictionary of chunked code segments.\n Returns: None\n\n- Name: consolidate_chunks_into_file\n Description: Combines all chunked code segments into a single string for output.\n Parameters: chunks (dict) - The dictionary of chunked code segments.\n Returns: str - The consolidated code segments as a single string.\n\n- Name: count_lines\n Description: Counts the number of lines in the consolidated code segments.\n Parameters: consolidated_chunks (str) - The consolidated code segments as a single string.\n Returns: int - The total number of lines in the consolidated code.",
"CodeParser.py": "Description: This file serves as a code parser for various programming languages, allowing for the extraction of points of interest and comments from code files.\n\nDependencies: os, subprocess, typing, tree_sitter, logging\n\nFunctions:\n- Name: \\_\\_init\\_\\_\n Description: Initializes the CodeParser object with the specified file extensions.\n Parameters: file_extensions (Union[None, List[str], str]) - The file extensions to parse.\n Returns: None\n\n- Name: parse_code\n Description: Parses the provided code based on the file extension.\n Parameters: code (str) - The code to parse, file_extension (str) - The extension of the code file.\n Returns: Union[None, Node] - The root node of the parsed code.\n\n- Name: extract_points_of_interest\n Description: Recursively extracts points of interest from the parsed code.\n Parameters: node (Node) - The current node being processed, file_extension (str) - The extension of the code file.\n Returns: List[Tuple[Node, str]] - A list of tuples containing nodes of interest and their types.\n\n- Name: extract_comments\n Description: Recursively extracts comments from the parsed code.\n Parameters: node (Node) - The current node being processed, file_extension (str) - The extension of the code file.\n Returns: List[Tuple[Node, str]] - A list of tuples containing comments and their types.\n\n- Name: get_lines_for_points_of_interest\n Description: Retrieves the line numbers of points of interest in the code.\n Parameters: code (str) - The code to analyze, file_extension (str) - The extension of the code file.\n Returns: List[int] - A list of line numbers with points of interest.\n\n- Name: get_lines_for_comments\n Description: Retrieves the line numbers of comments in the code.\n Parameters: code (str) - The code to analyze, file_extension (str) - The extension of the code file.\n Returns: List[int] - A list of line numbers with comments.\n\n- Name: print_all_line_types\n Description: Prints the line numbers with their corresponding node types.\n Parameters: code (str) - The code to analyze, file_extension (str) - The extension of the code file.\n Returns: None\n\n- Name: map_line_to_node_type\n Description: Maps line numbers to node types recursively.\n Parameters: node (Node) - The current node being processed, line_to_node_type (Dict) - Mapping of line numbers to node types, depth (int) - The depth of recursion.\n Returns: Dict[int, List[str]] - A dictionary mapping line numbers to node types.\n\n- Name: print_simple_line_numbers_with_code\n Description: Prints the line numbers with their corresponding code lines.\n Parameters: code (str) - The code to display.\n Returns: None",
"app.py": "Description: This file utilizes the Streamlit library to create a simple user interface for selecting a value and displaying its square.\n\nDependencies: streamlit\n\nFunctions:\n- Name: slider\n Description: Creates a slider widget for selecting a numerical value.\n Parameters: None\n Returns: The selected value (int or float)\n\n- Name: write\n Description: Writes the provided values to the user interface.\n Parameters: values (any) - The values to be displayed.\n Returns: None",
"test_code_chunker.py": "Description: This file contains unit tests for the CodeChunker class, which is responsible for chunking code files based on token limits.\n\nDependencies: unittest, unittest.mock, Chunker, CodeChunker, load_json, tiktoken, json, os\n\nFunctions:\n- Name: mock_count_tokens\n Description: Mocks the count_tokens function to return the number of tokens in a text string.\n Parameters: string (str) - The text string to count tokens for, encoding_name (str) - The name of the encoding model (default: 'gpt-4').\n Returns: int - The number of tokens in the text string.\n\n- Name: TestCodeChunkerPython\n Description: Python test class for testing code chunking functionality.\n Parameters: N/A\n Returns: N/A\n\n- Name: TestCodeChunkerJavaScript\n Description: JavaScript test class for testing code chunking functionality.\n Parameters: N/A\n Returns: N/A\n\n- Name: TestCodeChunkerCSS\n Description: CSS test class for testing code chunking functionality.\n Parameters: N/A\n Returns: N/A\n\nEach test method within the test classes follows a similar structure:\n- Description: Tests chunking code with specific characteristics (e.g., simple code, routes, models, main, utilities, big class, react component, media query, simple styles).\n- Parameters: py_code/js_code/css_code (str) - The code to be chunked, token_limit (int) - The limit of tokens per chunk.\n- Returns: N/A",
"utils.py": "Description: This file provides functions for tokenizing text strings and loading JSON files.\n\nDependencies: tiktoken, json\n\nFunctions:\n- Name: count_tokens\n Description: Returns the number of tokens in a text string.\n Parameters: string (str) - The text string to tokenize, encoding_name (str) - The name of the encoding model to use.\n Returns: int - The number of tokens in the text string.\n\n- Name: load_json\n Description: Loads and parses a JSON file.\n Parameters: json_file (str) - The path to the JSON file to load.\n Returns: dict - The parsed JSON data from the file."
}
mock_codefiles.json
CHANGED
@@ -1,27 +1,30 @@
{
"simple.py": "import sys\n\n# This is a sample Python file\n\ndef main():\n print('Hello, world!')\n\nif __name__ == '__main__':\n main()",
"text_only.py": "# This file is empty and should test the chunker's ability to handle empty files\n",
"routes.py": "from flask import Flask, jsonify, request, redirect, url_for\napp = Flask(__name__)\n\n@app.route('/', methods=['GET'])\ndef home():\n return '<h1>Welcome to the Home Page</h1>', 200\n\n@authenticate # Hypothetical decorator for authentication\n@log_access # Hypothetical decorator for logging access\n@app.route('/api/data', methods=['GET'])\ndef get_data():\n # Simulate fetching data from a database or external service\n data = {'key': 'This is some data'}\n return jsonify(data), 200\n\n@app.route('/api/data/<int:data_id>', methods=['GET'])\ndef get_data_by_id(data_id):\n # Simulate fetching specific data by ID\n data = {'id': data_id, 'value': 'Specific data based on ID'}\n return jsonify(data), 200\n\n@app.route('/api/data', methods=['POST'])\ndef post_data():\n data = request.json\n # Simulate saving data to a database\n return jsonify({'message': 'Data saved successfully', 'data': data}), 201\n\n@app.route('/api/data/<int:data_id>', methods=['PUT'])\ndef update_data(data_id):\n data = request.json\n # Simulate updating data in a database\n return jsonify({'message': 'Data updated successfully', 'id': data_id, 'data': data}), 200\n\n@app.route('/api/data/<int:data_id>', methods=['DELETE'])\ndef delete_data(data_id):\n # Simulate deleting data by ID\n return jsonify({'message': 'Data deleted successfully', 'id': data_id}), 200\n\n@app.route('/redirect', methods=['GET'])\ndef example_redirect():\n return redirect(url_for('home'))\n\nif __name__ == '__main__':\n app.run(debug=True)",
"models.py": "from sqlalchemy import Column, Integer, String, ForeignKey\nfrom sqlalchemy.ext.declarative import declarative_base\nfrom sqlalchemy.orm import relationship\n\nBase = declarative_base()\n\nclass User(Base):\n __tablename__ = 'users'\n id = Column(Integer, primary_key=True)\n username = Column(String, unique=True, nullable=False)\n email = Column(String, unique=True, nullable=False)\n\n posts = relationship('Post', backref='author')\n\nclass Post(Base):\n __tablename__ = 'posts'\n id = Column(Integer, primary_key=True)\n title = Column(String, nullable=False)\n content = Column(String, nullable=False)\n user_id = Column(Integer, ForeignKey('users.id'))",
"big_class.py": "class BigClass:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n\n def get_name(self):\n return self.name\n\n def get_age(self):\n return self.age\n\n def set_name(self, name):\n self.name = name\n\n def set_age(self, age):\n self.age = age\n\n def __str__(self):\n return f'Name: {self.name}, Age: {self.age}'",
"main.py": "from flask import Flask\nfrom routes import app as routes_app\n\n# Create the Flask application\napp = Flask(__name__)\n\n# Register the routes from the routes.py file\napp.register_blueprint(routes_app)\n\n# Configuration settings for the app can go here\napp.config['DEBUG'] = True\napp.config['SECRET_KEY'] = 'your_secret_key'\n\n# More complex app initialization steps can be added here\n# For example, database initialization, login manager setups, etc.\n\n# This function can be used to create a database schema\n def create_database(app):\n if not path.exists('yourdatabase.db'):\n db.create_all(app=app)\n print('Created Database!')\n\nif __name__ == '__main__':\n # Optionally, call database creation or other setup functions here\n create_database(app)\n app.run()",
"utilities.py": "import hashlib\nimport uuid\nimport re\nfrom datetime import datetime, timedelta\n\n# Function to hash a password\ndef hash_password(password):\n salt = uuid.uuid4().hex\n return hashlib.sha256(salt.encode() + password.encode()).hexdigest() + ':' + salt\n\n# Function to check a hashed password\ndef check_password(hashed_password, user_password):\n password, salt = hashed_password.split(':')\n return password == hashlib.sha256(salt.encode() + user_password.encode()).hexdigest()\n\n# Function to validate an email address\ndef validate_email(email):\n pattern = r\"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$\"\n return re.match(pattern, email) is not None\n\n# Function to generate a token expiration date\ndef generate_expiration_date(days=1):\n return datetime.now() + timedelta(days=days)\n\n# Function to convert a string to datetime\ndef string_to_datetime(date_string, format='%Y-%m-%d %H:%M:%S'):\n return datetime.strptime(date_string, format)\n\n# Function to convert datetime to string\ndef datetime_to_string(date, format='%Y-%m-%d %H:%M:%S'):\n return date.strftime(format)",
"services.py": "import requests\nfrom flask_sqlalchemy import SQLAlchemy\n\n# Assuming an initialized Flask app with SQLAlchemy\ndb = SQLAlchemy()\n\n# Example of a model for database operations\nclass UserData(db.Model):\n id = db.Column(db.Integer, primary_key=True)\n name = db.Column(db.String(100), nullable=False)\n email = db.Column(db.String(100), unique=True, nullable=False)\n\n def __repr__(self):\n return f'<User {self.name}>'\n\n# Function to fetch data from an external API\ndef fetch_external_data(api_url):\n response = requests.get(api_url)\n if response.status_code == 200:\n return response.json()\n else:\n return {'error': 'Failed to fetch data'}\n\n# Function to save user data to the database\ndef save_user_data(name, email):\n new_user = UserData(name=name, email=email)\n db.session.add(new_user)\n try:\n db.session.commit()\n return {'message': 'User saved successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to update user data in the database\ndef update_user_data(user_id, name=None, email=None):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n if name:\n user.name = name\n if email:\n user.email = email\n try:\n db.session.commit()\n return {'message': 'User updated successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to delete user data from the database\ndef delete_user_data(user_id):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n try:\n db.session.delete(user)\n db.session.commit()\n return {'message': 'User deleted successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}",
"simple.js": "// This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n}\n\nmain();",
"text_only.js": "// This file is empty and should test the chunker's ability to handle empty files\n",
"routes.js": "const express = require('express');\nconst router = express.Router();\n\n// Example of a simple route\nrouter.get('/', (req, res) => {\n res.send('Welcome to the Home Page');\n});\n\n// Example of a route with a parameter\nrouter.get('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ id: dataId, value: 'Specific data based on ID' });\n});\n\n// Example of a route that handles POST requests\nrouter.post('/api/data', (req, res) => {\n const data = req.body;\n res.status(201).json({ message: 'Data saved successfully', data: data });\n});\n\n// Example of a route that handles PUT requests\nrouter.put('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n const data = req.body;\n res.json({ message: 'Data updated successfully', id: dataId, data: data });\n});\n\n// Example of a route that handles DELETE requests\nrouter.delete('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ message: 'Data deleted successfully', id: dataId });\n});\n\nmodule.exports = router;",
"models.js": "const mongoose = require('mongoose');\n\n// Example of a simple Mongoose model\nconst userSchema = new mongoose.Schema({\n username: { type: String, required: true, unique: true },\n email: { type: String, required: true, unique: true }\n});\n\nconst User = mongoose.model('User', userSchema);\n\nmodule.exports = User;",
"big_class.js": "// This is a sample JavaScript class with a large number of methods\n\nclass BigClass {\n constructor(name, age) {\n this.name = name;\n this.age = age;\n }\n\n getName() {\n return this.name;\n }\n\n getAge() {\n return this.age;\n }\n\n setName(name) {\n this.name = name;\n }\n\n setAge(age) {\n this.age = age;\n }\n\n toString() {\n return `Name: ${this.name}, Age: ${this.age}`;\n }\n}\n\nmodule.exports = BigClass;",
"main.js": "const express = require('express');\nconst routes = require('./routes');\n\n// Create the Express application\nconst app = express();\n\n// Register the routes from the routes.js file\napp.use('/', routes);\n\n// Configuration settings for the app can go here\napp.set('port', process.env.PORT || 3000);\n\n// More complex app initialization steps can be added here\n// For example, database initialization, middleware setups, etc.\n\n// This function can be used to create a database schema\nfunction createDatabase() {\n // Code to create database schema\n console.log('Created Database!');\n}\n\n// Optionally, call database creation or other setup functions here\ncreateDatabase();\n\n// Start the server\napp.listen(app.get('port'), () => {\n console.log(`Server running on port ${app.get('port')}`);\n});",
"utilities.js": "// Example of utility functions for common tasks\n\n// Function to hash a password\nfunction hashPassword(password) {\n const salt = uuidv4();\n return sha256(salt + password) + ':' + salt;\n}\n\n// Function to check a hashed password\nfunction checkPassword(hashedPassword, userPassword) {\n const [password, salt] = hashedPassword.split(':');\n return sha256(salt + userPassword) === password;\n}\n\n// Function to validate an email address\nfunction validateEmail(email) {\n const pattern = /^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$/;\n return pattern.test(email);\n}\n\n// Function to generate a token expiration date\nfunction generateExpirationDate(days = 1) {\n const expirationDate = new Date();\n expirationDate.setDate(expirationDate.getDate() + days);\n return expirationDate;\n}\n\n// Function to convert a string to a Date object\nfunction stringToDate(dateString, format = 'YYYY-MM-DD HH:mm:ss') {\n return new Date(dateString);\n}\n\n// Function to convert a Date object to a string\nfunction dateToString(date, format = 'YYYY-MM-DD HH:mm:ss') {\n return date.toISOString();\n}",
"services.js": "// Example of service functions for handling data operations\n\n// Function to fetch data from an external API\nasync function fetchExternalData(apiUrl) {\n try {\n const response = await fetch(apiUrl);\n if (response.ok) {\n return await response.json();\n } else {\n return { error: 'Failed to fetch data' };\n }\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to save user data to the database\nasync function saveUserData(name, email) {\n try {\n const newUser = new UserData({ name, email });\n await newUser.save();\n return { message: 'User saved successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to update user data in the database\nasync function updateUserData(userId, name, email) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n if (name) {\n user.name = name;\n }\n if (email) {\n user.email = email;\n }\n await user.save();\n return { message: 'User updated successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to delete user data from the database\nasync function deleteUserData(userId) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n await user.remove();\n return { message: 'User deleted successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}",
"react_component.js": "import React from 'react';\n\nimport './SearchResults.css';\n\nimport TrackList from '../TrackList/TrackList.js';\n\n constructor(props) {\n super(props);\n this.addTopFive = this.addTopFive.bind(this);\n this.addTopTen = this.addTopTen.bind(this);\n this.addAll = this.addAll.bind(this);\n }\n\n //add the top five tracks to the playlist\n addTopFive() {\n this.props.onAdd(this.props.searchResults.slice(0, 5));\n }\n\n //add top 10 tracks to the playlist\n addTopTen() {\n this.props.onAdd(this.props.searchResults.slice(0, 10));\n }\n\n addAll() {\n this.props.onAdd(this.props.searchResults);\n }\n render() {\n return (\n <div className=\"SearchResults\">\n <h2>Results</h2>\n <TrackList tracks={this.props.searchResults} onAdd={this.props.onAdd} onToggle={this.props.onToggle} currentTrack={this.props.currentTrack}/>\n </div>\n );\n }\n}\n\nexport default SearchResults;'",
"simple_styles.css": "/* Example of CSS styles for a web page */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}",
"media_queries.css": "/* Example of CSS styles with media queries for responsive design */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}",
"single_syntax_error_example.py": "# This is a sample Python file\n\nprint('Hello, world!'\n\n",
"multiple_syntax_errors.py": "def calculate_sum(lst):\n total = 0\n for num in lst\n total += num\n return total\n\nprint(calculate_sum([1, 2, 3, 4])\n\ndef string_manipulator(s):\n new_string = ''\n for char in s:\n if char == 'a':\n new_string += 'z'\n else:\n new_string += char\n return new_string\n\nprint(string_manipulate('banana'))\n\ndef find_max(numbers):\n max_num = numbers[0]\n for num in numbers\n if num > max_num\n max_num = num\n return max_num\n\nprint(find_max([1, 2, 3, 4, 5])",
"single_syntax_error_example.js": "//This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n if (true) {\n console.log('hi');\n \n}\n\nmain();",
"multiple_syntax_errors.js": "function calculateSum(arr) {\n let total = 0;\n for (let i = 0; i < arr.length; i++ {\n total += arr[i];\n }\n return total;\n}\n\nconsole.log(calculateSum([1, 2, 3, 4);\n\nfunction stringManipulator(str) {\n let newString = '';\n for (let i = 0; i < str.length; i++) {\n if (str.charAt(i) === 'a')\n newString += 'z';\n } else {\n newString += str.charAt(i);\n }\n }\n return newString;\n}\n\nconsole.log(stringManipulator('banana'));\n\nfunction findMax(numbers) {\n let maxNum = numbers[0];\n for (let i = 1; i < numbers.length; i++) {\n if (numbers[i] > maxNum) {\n maxNum = numbers[i];\n }\n }\n return maxNum;\n}\n\nconsole.log(findMax([1, 2, 3, 4, 5]);",
"single_syntax_error_example.css": "\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n",
"multiple_syntax_errors.css": "body {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}",
"example.ts": "interface User {\n id: number;\n name: string;\n email: string;\n}\n\nclass UserManager {\n private users: User[] = [];\n\n addUser(user: User): void {\n this.users.push(user);\n }\n\n getUser(id: number): User | undefined {\n return this.users.find(user => user.id === id);\n }\n\n updateUser(id: number, updatedUser: Partial<User>): void {\n const userIndex = this.users.findIndex(user => user.id === id);\n if (userIndex !== -1) {\n this.users[userIndex] = { ...this.users[userIndex], ...updatedUser };\n }\n }\n\n deleteUser(id: number): void {\n this.users = this.users.filter(user => user.id !== id);\n }\n}\n\nconst userManager = new UserManager();\nuserManager.addUser({ id: 1, name: 'John Doe', email: 'john@example.com' });\nconsole.log(userManager.getUser(1));",
"example.rb": "class User\n attr_accessor :id, :name, :email\n\n def initialize(id, name, email)\n @id = id\n @name = name\n @email = email\n end\n\n def to_s\n \"User: #{@name} (#{@email})\"\n end\nend\n\nclass UserManager\n def initialize\n @users = []\n end\n\n def add_user(user)\n @users << user\n end\n\n def get_user(id)\n @users.find { |user| user.id == id }\n end\n\n def update_user(id, updated_user)\n user = get_user(id)\n user.name = updated_user.name if updated_user.name\n user.email = updated_user.email if updated_user.email\n end\n\n def delete_user(id)\n @users.delete_if { |user| user.id == id }\n end\nend\n\nuser_manager = UserManager.new\nuser_manager.add_user(User.new(1, 'John Doe', 'john@example.com'))\nputs user_manager.get_user(1)",
"example.php": "<?php\n\nclass User {\n public $id;\n public $name;\n public $email;\n\n public function __construct($id, $name, $email) {\n $this->id = $id;\n $this->name = $name;\n $this->email = $email;\n }\n}\n\nclass UserManager {\n private $users = [];\n\n public function addUser($user) {\n $this->users[] = $user;\n }\n\n public function getUser($id) {\n foreach ($this->users as $user) {\n if ($user->id === $id) {\n return $user;\n }\n }\n return null;\n }\n\n public function updateUser($id, $updatedUser) {\n foreach ($this->users as &$user) {\n if ($user->id === $id) {\n $user->name = $updatedUser->name ?? $user->name;\n $user->email = $updatedUser->email ?? $user->email;\n break;\n }\n }\n }\n\n public function deleteUser($id) {\n $this->users = array_filter($this->users, function($user) use ($id) {\n return $user->id !== $id;\n });\n }\n}\n\n$userManager = new UserManager();\n$userManager->addUser(new User(1, 'John Doe', 'john@example.com'));\nvar_dump($userManager->getUser(1));\n?>"
}
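The fixture file above is what the new tests load. As a minimal sketch of how one of these mock files can be pushed through the chunker outside the test suite — assuming, as the test file further down suggests, that CodeChunker and Chunker are importable from Chunker.py and load_json from utils.py — one could write:

from Chunker import Chunker, CodeChunker
from utils import load_json

# Load the mock codebase shipped with this commit and pick one fixture.
mock_codebase = load_json('mock_codefiles.json')
ts_code = mock_codebase['example.ts']

# Chunk it the same way the tests do, with a small token budget.
chunker = CodeChunker(file_extension='ts')
chunks = chunker.chunk(ts_code, token_limit=20)

Chunker.print_chunks(chunks)                            # inspect chunk boundaries
rebuilt = Chunker.consolidate_chunks_into_file(chunks)
assert ts_code in rebuilt                               # chunking should be lossless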
requirements.txt
CHANGED
@@ -1,68 +1,46 @@
-altair==5.2.0
-annotated-types==0.6.0
-anyio==4.3.0
+altair==5.3.0
 attrs==23.2.0
+blinker==1.7.0
+cachetools==5.3.3
 certifi==2024.2.2
 charset-normalizer==3.3.2
 click==8.1.7
 colorama==0.4.6
-fastapi==0.110.0
-ffmpy==0.3.2
-filelock==3.13.1
-fonttools==4.49.0
-fsspec==2024.2.0
-gradio==4.19.2
-gradio_client==0.10.1
-h11==0.14.0
-httpcore==1.0.4
-httpx==0.27.0
-huggingface-hub==0.20.3
+gitdb==4.0.11
+GitPython==3.1.43
 idna==3.6
-importlib_resources==6.1.2
 Jinja2==3.1.3
 jsonschema==4.21.1
 jsonschema-specifications==2023.12.1
-kiwisolver==1.4.5
 markdown-it-py==3.0.0
 MarkupSafe==2.1.5
-matplotlib==3.8.3
 mdurl==0.1.2
 numpy==1.26.4
-packaging==23.2
+packaging==24.0
 pandas==2.2.1
-pillow==10.
+pillow==10.3.0
+protobuf==4.25.3
+pyarrow==15.0.2
+pydeck==0.8.1b0
 Pygments==2.17.2
-python-dateutil==2.8.2
-python-multipart==0.0.9
+python-dateutil==2.9.0.post0
 pytz==2024.1
-referencing==0.33.0
-regex==2023.12.25
+referencing==0.34.0
 requests==2.31.0
-rich==13.7.
+rich==13.7.1
 rpds-py==0.18.0
-ruff==0.2.2
-semantic-version==2.10.0
-setuptools==69.1.1
-shellingham==1.5.4
 six==1.16.0
+smmap==5.0.1
+streamlit==1.33.0
+tenacity==8.2.3
+regex==2023.12.25
+tiktoken==0.6.0
+tree-sitter==0.21.3
+toml==0.10.2
 toolz==0.12.1
-typer==0.9.0
-typing_extensions==4.10.0
+tornado==6.4
+typing_extensions==4.11.0
 tzdata==2024.1
 urllib3==2.2.1
+watchdog==4.0.0
+setuptools==69.2.0
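The dependency changes above drop the Gradio/FastAPI stack and add Streamlit, tiktoken, and tree-sitter. Purely as an illustration of how the new UI dependency could front the chunker — a hypothetical sketch, not the actual application code in this commit, and reusing the assumed CodeChunker import from the sketch above — a Streamlit page might look like:

import streamlit as st
from Chunker import CodeChunker  # assumed import path, as in the sketch above

st.title('Code Chunker')
ext = st.selectbox('File extension', ['py', 'js', 'css', 'ts', 'rb', 'php'])
token_limit = st.number_input('Token limit per chunk', min_value=5, value=25)
source = st.text_area('Paste code to chunk')

if st.button('Chunk') and source:
    chunks = CodeChunker(file_extension=ext).chunk(source, token_limit=token_limit)
    st.write(chunks)  # render whatever structure chunk() returns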
test_code_chunker.py
CHANGED
@@ -208,5 +208,75 @@ class TestCodeChunkerCSS(unittest.TestCase):
         self.assertEqual(num_lines, len(css_code.split("\n")))
         self.assertIn(css_code, final_code)
 
+
+
+# TypeScript Test Class
+class TestCodeChunkerTypeScript(unittest.TestCase):
+
+    def setUp(self):
+        self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
+        self.mock_count_tokens = self.patcher.start()
+        self.code_chunker = CodeChunker(file_extension='ts')
+        self.mock_codebase = load_json('mock_codefiles.json')
+
+
+    def tearDown(self):
+        self.patcher.stop()
+
+    def test_chunk_typescript_code(self):
+        ts_code = self.mock_codebase['example.ts']
+        chunks = self.code_chunker.chunk(ts_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(final_code)
+        self.assertEqual(num_lines, len(ts_code.split("\n")))
+        self.assertIn(ts_code, final_code)
+        self.assertGreater(len(chunks), 1)  # Ensure the code is actually chunked
+
+# Ruby Test Class
+class TestCodeChunkerRuby(unittest.TestCase):
+
+    def setUp(self):
+        self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
+        self.mock_count_tokens = self.patcher.start()
+        self.code_chunker = CodeChunker(file_extension='rb')
+        self.mock_codebase = load_json('mock_codefiles.json')
+
+
+    def tearDown(self):
+        self.patcher.stop()
+
+    def test_chunk_ruby_code(self):
+        rb_code = self.mock_codebase['example.rb']
+        chunks = self.code_chunker.chunk(rb_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(final_code)
+        self.assertEqual(num_lines, len(rb_code.split("\n")))
+        self.assertIn(rb_code, final_code)
+        self.assertGreater(len(chunks), 1)  # Ensure the code is actually chunked
+
+# PHP Test Class
+class TestCodeChunkerPHP(unittest.TestCase):
+
+    def setUp(self):
+        self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
+        self.mock_count_tokens = self.patcher.start()
+        self.code_chunker = CodeChunker(file_extension='php')
+        self.mock_codebase = load_json('mock_codefiles.json')
+
+    def tearDown(self):
+        self.patcher.stop()
+
+    def test_chunk_php_code(self):
+        php_code = self.mock_codebase['example.php']
+        chunks = self.code_chunker.chunk(php_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(final_code)
+        self.assertEqual(num_lines, len(php_code.split("\n")))
+        self.assertIn(php_code, final_code)
+        self.assertGreater(len(chunks), 1)  # Ensure the code is actually chunked
+
 if __name__ == '__main__':
     unittest.main()
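The three new classes reuse the same mock_count_tokens patch and load_json helper as the existing Python, JavaScript, and CSS suites, so the whole file still runs with the standard runner, e.g. python -m unittest test_code_chunker -v (plain unittest invocation; this commit adds no extra test tooling).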
utils.py
CHANGED
@@ -7,8 +7,6 @@ def count_tokens(string: str, encoding_name: str) -> int:
     num_tokens = len(encoding.encode(string))
     return num_tokens
 
-
-
 def load_json(json_file):
     with open(json_file) as f:
         return json.load(f)
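count_tokens in the hunk above only shows the encode/len half of the helper; with tiktoken now pinned in requirements.txt, the conventional shape of the full function is the following sketch (the get_encoding line is an assumption, since it sits outside the lines shown in this diff):

import tiktoken

def count_tokens(string: str, encoding_name: str) -> int:
    # Assumed setup line; only the two lines below it appear in the hunk above.
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

print(count_tokens('def add(a, b): return a + b', 'gpt2'))  # prints the token count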