CintraAI committed on
Commit
b7b387e
·
2 Parent(s): a404cdb d3767f0

Add code chunking functionality

Browse files
.github/workflows/main.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ with:
15
+ fetch-depth: 0
16
+ lfs: true
17
+ - name: Push to hub
18
+ env:
19
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
20
+ run: git push https://jsham042:$HF_TOKEN@huggingface.co/spaces/CintraAI/code-chunker main
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
.idea/code-chunker.iml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="JAVA_MODULE" version="4">
3
+ <component name="NewModuleRootManager" inherit-compiler-output="true">
4
+ <exclude-output />
5
+ <content url="file://$MODULE_DIR$">
6
+ <excludeFolder url="file://$MODULE_DIR$/venv" />
7
+ </content>
8
+ <orderEntry type="inheritedJdk" />
9
+ <orderEntry type="sourceFolder" forTests="false" />
10
+ </component>
11
+ <component name="PackageRequirementsSettings">
12
+ <option name="removeUnused" value="true" />
13
+ <option name="modifyBaseFiles" value="true" />
14
+ </component>
15
+ </module>
.idea/misc.xml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Python 3.12 (code-chunker)" />
5
+ </component>
6
+ <component name="ProjectRootManager" version="2" languageLevel="JDK_21" project-jdk-name="Python 3.12 (code-chunker)" project-jdk-type="Python SDK">
7
+ <output url="file://$PROJECT_DIR$/out" />
8
+ </component>
9
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/code-chunker.iml" filepath="$PROJECT_DIR$/.idea/code-chunker.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
Chunker.py CHANGED
@@ -3,7 +3,6 @@ from CodeParser import CodeParser
3
  from utils import count_tokens
4
 
5
 
6
-
7
  class Chunker(ABC):
8
  def __init__(self, encoding_name="gpt-4"):
9
  self.encoding_name = encoding_name
@@ -20,19 +19,20 @@ class Chunker(ABC):
20
  def print_chunks(chunks):
21
  for chunk_number, chunk_code in chunks.items():
22
  print(f"Chunk {chunk_number}:")
23
- print("="*40)
24
  print(chunk_code)
25
- print("="*40)
26
 
27
  @staticmethod
28
  def consolidate_chunks_into_file(chunks):
29
  return "\n".join(chunks.values())
30
-
31
  @staticmethod
32
  def count_lines(consolidated_chunks):
33
  lines = consolidated_chunks.split("\n")
34
  return len(lines)
35
 
 
36
  class CodeChunker(Chunker):
37
  def __init__(self, file_extension, encoding_name="gpt-4"):
38
  super().__init__(encoding_name)
@@ -60,15 +60,16 @@ class CodeChunker(Chunker):
60
  if highest_comment_line: # If a highest comment line exists, add it
61
  adjusted_breakpoints.append(highest_comment_line)
62
  else:
63
- adjusted_breakpoints.append(bp) # If no comments were found before the breakpoint, add the original breakpoint
 
64
 
65
  breakpoints = sorted(set(adjusted_breakpoints)) # Ensure breakpoints are unique and sorted
66
-
67
  while i < len(lines):
68
  line = lines[i]
69
  new_token_count = count_tokens(line, self.encoding_name)
70
  if token_count + new_token_count > token_limit:
71
-
72
  # Set the stop line to the last breakpoint before the current line
73
  if i in breakpoints:
74
  stop_line = i
@@ -79,20 +80,20 @@ class CodeChunker(Chunker):
79
  if stop_line == start_line and i not in breakpoints:
80
  token_count += new_token_count
81
  i += 1
82
-
83
  # If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
84
  elif stop_line == start_line and i == stop_line:
85
  token_count += new_token_count
86
  i += 1
87
-
88
-
89
  # If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
90
  elif stop_line == start_line and i in breakpoints:
91
  current_chunk = "\n".join(lines[start_line:stop_line])
92
  if current_chunk.strip(): # If the current chunk is not just whitespace
93
  chunks[chunk_number] = current_chunk # Using chunk_number as key
94
  chunk_number += 1
95
-
96
  token_count = 0
97
  start_line = i
98
  i += 1
@@ -103,7 +104,7 @@ class CodeChunker(Chunker):
103
  if current_chunk.strip():
104
  chunks[chunk_number] = current_chunk # Using chunk_number as key
105
  chunk_number += 1
106
-
107
  i = stop_line
108
  token_count = 0
109
  start_line = stop_line
@@ -116,9 +117,8 @@ class CodeChunker(Chunker):
116
  current_chunk_code = "\n".join(lines[start_line:])
117
  if current_chunk_code.strip(): # Checks if the chunk is not just whitespace
118
  chunks[chunk_number] = current_chunk_code # Using chunk_number as key
119
-
120
  return chunks
121
 
122
  def get_chunk(self, chunked_codebase, chunk_number):
123
  return chunked_codebase[chunk_number]
124
-
 
3
  from utils import count_tokens
4
 
5
 
 
6
  class Chunker(ABC):
7
  def __init__(self, encoding_name="gpt-4"):
8
  self.encoding_name = encoding_name
 
19
  def print_chunks(chunks):
20
  for chunk_number, chunk_code in chunks.items():
21
  print(f"Chunk {chunk_number}:")
22
+ print("=" * 40)
23
  print(chunk_code)
24
+ print("=" * 40)
25
 
26
  @staticmethod
27
  def consolidate_chunks_into_file(chunks):
28
  return "\n".join(chunks.values())
29
+
30
  @staticmethod
31
  def count_lines(consolidated_chunks):
32
  lines = consolidated_chunks.split("\n")
33
  return len(lines)
34
 
35
+
36
  class CodeChunker(Chunker):
37
  def __init__(self, file_extension, encoding_name="gpt-4"):
38
  super().__init__(encoding_name)
 
60
  if highest_comment_line: # If a highest comment line exists, add it
61
  adjusted_breakpoints.append(highest_comment_line)
62
  else:
63
+ adjusted_breakpoints.append(
64
+ bp) # If no comments were found before the breakpoint, add the original breakpoint
65
 
66
  breakpoints = sorted(set(adjusted_breakpoints)) # Ensure breakpoints are unique and sorted
67
+
68
  while i < len(lines):
69
  line = lines[i]
70
  new_token_count = count_tokens(line, self.encoding_name)
71
  if token_count + new_token_count > token_limit:
72
+
73
  # Set the stop line to the last breakpoint before the current line
74
  if i in breakpoints:
75
  stop_line = i
 
80
  if stop_line == start_line and i not in breakpoints:
81
  token_count += new_token_count
82
  i += 1
83
+
84
  # If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
85
  elif stop_line == start_line and i == stop_line:
86
  token_count += new_token_count
87
  i += 1
88
+
89
+
90
  # If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
91
  elif stop_line == start_line and i in breakpoints:
92
  current_chunk = "\n".join(lines[start_line:stop_line])
93
  if current_chunk.strip(): # If the current chunk is not just whitespace
94
  chunks[chunk_number] = current_chunk # Using chunk_number as key
95
  chunk_number += 1
96
+
97
  token_count = 0
98
  start_line = i
99
  i += 1
 
104
  if current_chunk.strip():
105
  chunks[chunk_number] = current_chunk # Using chunk_number as key
106
  chunk_number += 1
107
+
108
  i = stop_line
109
  token_count = 0
110
  start_line = stop_line
 
117
  current_chunk_code = "\n".join(lines[start_line:])
118
  if current_chunk_code.strip(): # Checks if the chunk is not just whitespace
119
  chunks[chunk_number] = current_chunk_code # Using chunk_number as key
120
+
121
  return chunks
122
 
123
  def get_chunk(self, chunked_codebase, chunk_number):
124
  return chunked_codebase[chunk_number]
 
LICENSE CHANGED
@@ -1,201 +1,21 @@
1
- Apache License
2
- Version 2.0, January 2004
3
- http://www.apache.org/licenses/
4
-
5
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
-
7
- 1. Definitions.
8
-
9
- "License" shall mean the terms and conditions for use, reproduction,
10
- and distribution as defined by Sections 1 through 9 of this document.
11
-
12
- "Licensor" shall mean the copyright owner or entity authorized by
13
- the copyright owner that is granting the License.
14
-
15
- "Legal Entity" shall mean the union of the acting entity and all
16
- other entities that control, are controlled by, or are under common
17
- control with that entity. For the purposes of this definition,
18
- "control" means (i) the power, direct or indirect, to cause the
19
- direction or management of such entity, whether by contract or
20
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
- outstanding shares, or (iii) beneficial ownership of such entity.
22
-
23
- "You" (or "Your") shall mean an individual or Legal Entity
24
- exercising permissions granted by this License.
25
-
26
- "Source" form shall mean the preferred form for making modifications,
27
- including but not limited to software source code, documentation
28
- source, and configuration files.
29
-
30
- "Object" form shall mean any form resulting from mechanical
31
- transformation or translation of a Source form, including but
32
- not limited to compiled object code, generated documentation,
33
- and conversions to other media types.
34
-
35
- "Work" shall mean the work of authorship, whether in Source or
36
- Object form, made available under the License, as indicated by a
37
- copyright notice that is included in or attached to the work
38
- (an example is provided in the Appendix below).
39
-
40
- "Derivative Works" shall mean any work, whether in Source or Object
41
- form, that is based on (or derived from) the Work and for which the
42
- editorial revisions, annotations, elaborations, or other modifications
43
- represent, as a whole, an original work of authorship. For the purposes
44
- of this License, Derivative Works shall not include works that remain
45
- separable from, or merely link (or bind by name) to the interfaces of,
46
- the Work and Derivative Works thereof.
47
-
48
- "Contribution" shall mean any work of authorship, including
49
- the original version of the Work and any modifications or additions
50
- to that Work or Derivative Works thereof, that is intentionally
51
- submitted to Licensor for inclusion in the Work by the copyright owner
52
- or by an individual or Legal Entity authorized to submit on behalf of
53
- the copyright owner. For the purposes of this definition, "submitted"
54
- means any form of electronic, verbal, or written communication sent
55
- to the Licensor or its representatives, including but not limited to
56
- communication on electronic mailing lists, source code control systems,
57
- and issue tracking systems that are managed by, or on behalf of, the
58
- Licensor for the purpose of discussing and improving the Work, but
59
- excluding communication that is conspicuously marked or otherwise
60
- designated in writing by the copyright owner as "Not a Contribution."
61
-
62
- "Contributor" shall mean Licensor and any individual or Legal Entity
63
- on behalf of whom a Contribution has been received by Licensor and
64
- subsequently incorporated within the Work.
65
-
66
- 2. Grant of Copyright License. Subject to the terms and conditions of
67
- this License, each Contributor hereby grants to You a perpetual,
68
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
- copyright license to reproduce, prepare Derivative Works of,
70
- publicly display, publicly perform, sublicense, and distribute the
71
- Work and such Derivative Works in Source or Object form.
72
-
73
- 3. Grant of Patent License. Subject to the terms and conditions of
74
- this License, each Contributor hereby grants to You a perpetual,
75
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
- (except as stated in this section) patent license to make, have made,
77
- use, offer to sell, sell, import, and otherwise transfer the Work,
78
- where such license applies only to those patent claims licensable
79
- by such Contributor that are necessarily infringed by their
80
- Contribution(s) alone or by combination of their Contribution(s)
81
- with the Work to which such Contribution(s) was submitted. If You
82
- institute patent litigation against any entity (including a
83
- cross-claim or counterclaim in a lawsuit) alleging that the Work
84
- or a Contribution incorporated within the Work constitutes direct
85
- or contributory patent infringement, then any patent licenses
86
- granted to You under this License for that Work shall terminate
87
- as of the date such litigation is filed.
88
-
89
- 4. Redistribution. You may reproduce and distribute copies of the
90
- Work or Derivative Works thereof in any medium, with or without
91
- modifications, and in Source or Object form, provided that You
92
- meet the following conditions:
93
-
94
- (a) You must give any other recipients of the Work or
95
- Derivative Works a copy of this License; and
96
-
97
- (b) You must cause any modified files to carry prominent notices
98
- stating that You changed the files; and
99
-
100
- (c) You must retain, in the Source form of any Derivative Works
101
- that You distribute, all copyright, patent, trademark, and
102
- attribution notices from the Source form of the Work,
103
- excluding those notices that do not pertain to any part of
104
- the Derivative Works; and
105
-
106
- (d) If the Work includes a "NOTICE" text file as part of its
107
- distribution, then any Derivative Works that You distribute must
108
- include a readable copy of the attribution notices contained
109
- within such NOTICE file, excluding those notices that do not
110
- pertain to any part of the Derivative Works, in at least one
111
- of the following places: within a NOTICE text file distributed
112
- as part of the Derivative Works; within the Source form or
113
- documentation, if provided along with the Derivative Works; or,
114
- within a display generated by the Derivative Works, if and
115
- wherever such third-party notices normally appear. The contents
116
- of the NOTICE file are for informational purposes only and
117
- do not modify the License. You may add Your own attribution
118
- notices within Derivative Works that You distribute, alongside
119
- or as an addendum to the NOTICE text from the Work, provided
120
- that such additional attribution notices cannot be construed
121
- as modifying the License.
122
-
123
- You may add Your own copyright statement to Your modifications and
124
- may provide additional or different license terms and conditions
125
- for use, reproduction, or distribution of Your modifications, or
126
- for any such Derivative Works as a whole, provided Your use,
127
- reproduction, and distribution of the Work otherwise complies with
128
- the conditions stated in this License.
129
-
130
- 5. Submission of Contributions. Unless You explicitly state otherwise,
131
- any Contribution intentionally submitted for inclusion in the Work
132
- by You to the Licensor shall be under the terms and conditions of
133
- this License, without any additional terms or conditions.
134
- Notwithstanding the above, nothing herein shall supersede or modify
135
- the terms of any separate license agreement you may have executed
136
- with Licensor regarding such Contributions.
137
-
138
- 6. Trademarks. This License does not grant permission to use the trade
139
- names, trademarks, service marks, or product names of the Licensor,
140
- except as required for reasonable and customary use in describing the
141
- origin of the Work and reproducing the content of the NOTICE file.
142
-
143
- 7. Disclaimer of Warranty. Unless required by applicable law or
144
- agreed to in writing, Licensor provides the Work (and each
145
- Contributor provides its Contributions) on an "AS IS" BASIS,
146
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
- implied, including, without limitation, any warranties or conditions
148
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
- PARTICULAR PURPOSE. You are solely responsible for determining the
150
- appropriateness of using or redistributing the Work and assume any
151
- risks associated with Your exercise of permissions under this License.
152
-
153
- 8. Limitation of Liability. In no event and under no legal theory,
154
- whether in tort (including negligence), contract, or otherwise,
155
- unless required by applicable law (such as deliberate and grossly
156
- negligent acts) or agreed to in writing, shall any Contributor be
157
- liable to You for damages, including any direct, indirect, special,
158
- incidental, or consequential damages of any character arising as a
159
- result of this License or out of the use or inability to use the
160
- Work (including but not limited to damages for loss of goodwill,
161
- work stoppage, computer failure or malfunction, or any and all
162
- other commercial damages or losses), even if such Contributor
163
- has been advised of the possibility of such damages.
164
-
165
- 9. Accepting Warranty or Additional Liability. While redistributing
166
- the Work or Derivative Works thereof, You may choose to offer,
167
- and charge a fee for, acceptance of support, warranty, indemnity,
168
- or other liability obligations and/or rights consistent with this
169
- License. However, in accepting such obligations, You may act only
170
- on Your own behalf and on Your sole responsibility, not on behalf
171
- of any other Contributor, and only if You agree to indemnify,
172
- defend, and hold each Contributor harmless for any liability
173
- incurred by, or claims asserted against, such Contributor by reason
174
- of your accepting any such warranty or additional liability.
175
-
176
- END OF TERMS AND CONDITIONS
177
-
178
- APPENDIX: How to apply the Apache License to your work.
179
-
180
- To apply the Apache License to your work, attach the following
181
- boilerplate notice, with the fields enclosed by brackets "[]"
182
- replaced with your own identifying information. (Don't include
183
- the brackets!) The text should be enclosed in the appropriate
184
- comment syntax for the file format. We also recommend that a
185
- file or class name and description of purpose be included on the
186
- same "printed page" as the copyright notice for easier
187
- identification within third-party archives.
188
-
189
- Copyright [yyyy] [name of copyright owner]
190
-
191
- Licensed under the Apache License, Version 2.0 (the "License");
192
- you may not use this file except in compliance with the License.
193
- You may obtain a copy of the License at
194
-
195
- http://www.apache.org/licenses/LICENSE-2.0
196
-
197
- Unless required by applicable law or agreed to in writing, software
198
- distributed under the License is distributed on an "AS IS" BASIS,
199
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
- See the License for the specific language governing permissions and
201
- limitations under the License.
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 CINTRAAI Code Chunker
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,3 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # CintraAI Code Chunker
2
 
3
  Cintra's Code Chunker is a novel open-source tool designed to enhance code readability and maintainability by intelligently chunking code files based on key points of interest. This tool leverages advanced parsing techniques to identify significant elements in your code, such as functions, classes, and comments, to organize your codebase into manageable, easily understandable chunks. It's an invaluable resource for applications such as RAG, code patching, and other use cases.
@@ -7,24 +20,29 @@ Cintra's Code Chunker is a novel open-source tool designed to enhance code reada
7
  - **Intelligent Chunking:** Break down your code files into chunks around key points of interest like function definitions, class declarations, and crucial comments.
8
  - **Customizable Token Limits:** Control the size of each chunk with customizable token limits, ensuring that chunks remain manageable and focused.
9
  - **Support for Multiple Languages:** Initially supporting Python, JavaScript, and CSS, with plans to expand to more programming languages.
 
 
 
 
 
 
10
 
11
  ## Getting Started
12
 
13
  ### Prerequisites
14
 
15
  - Python 3.8+
16
- - OpenAI API key (for token counting features)
17
 
18
  ### Installation
19
 
20
  1. Clone the repository:
21
  ```sh
22
- git clone https://github.com/yourgithubusername/code-chunker-parser.git
23
  ```
24
 
25
  2. Navigate to the project directory
26
  ```sh
27
- pip install -r requirements.txt
28
  ```
29
  3. Install the required dependencies
30
  ```sh
@@ -35,8 +53,6 @@ pip install -r requirements.txt
35
  Use the CodeChunker class to chunk a specific code file. You can specify the file extension and token limit for chunking.
36
  Example:
37
  ```py
38
- from backend.app.util.TextChunker.Chunker import CodeChunker
39
-
40
  chunker = CodeChunker(file_extension='py', encoding_name='gpt-4')
41
  chunks = chunker.chunk(your_code_here, token_limit=1000)
42
  CodeChunker.print_chunks(chunks)
@@ -46,21 +62,26 @@ CodeChunker.print_chunks(chunks)
46
  The CodeParser class allows you to parse code to identify points of interest and comments, which can then be used for chunking or other analysis.
47
  Example:
48
  ```
49
- from backend.app.util.CodeParsing.CodeParser import CodeParser
50
-
51
  parser = CodeParser(['py'])
52
  tree = parser.parse_code(your_code_here, 'py')
53
  points_of_interest = parser.extract_points_of_interest(tree, 'py')
54
  ```
55
 
 
 
 
 
 
 
 
 
56
  ## Contributing
57
  We welcome contributions from the community, whether it's through reporting bugs, submitting feature requests, or sending pull requests. Please check the CONTRIBUTING.md file for more details on how to contribute to the project.
58
 
59
  ## License
60
- This project is licensed under the Apache 2.0 license. See the License file for details
61
 
62
  ## Acknowledgments
63
  - This project utilizes the tree-sitter project for parsing code.
64
  - This also uses tiktoken to count tokens for determining chunk sizes.
65
 
66
-
 
1
+ ---
2
+ title: CintraAI Code Chunker
3
+ emoji: 🧩
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.33.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+
14
  # CintraAI Code Chunker
15
 
16
  Cintra's Code Chunker is a novel open-source tool designed to enhance code readability and maintainability by intelligently chunking code files based on key points of interest. This tool leverages advanced parsing techniques to identify significant elements in your code, such as functions, classes, and comments, to organize your codebase into manageable, easily understandable chunks. It's an invaluable resource for applications such as RAG, code patching, and other use cases.
 
20
  - **Intelligent Chunking:** Break down your code files into chunks around key points of interest like function definitions, class declarations, and crucial comments.
21
  - **Customizable Token Limits:** Control the size of each chunk with customizable token limits, ensuring that chunks remain manageable and focused.
22
  - **Support for Multiple Languages:** Initially supporting Python, JavaScript, and CSS, with plans to expand to more programming languages.
23
+
24
+ ## Try Out Code Chunker!
25
+
26
+ Interested in seeing how it works? Check out our interactive demo on **Hugging Face Spaces**.
27
+
28
+ [**Click here to try it out!**](https://huggingface.co/spaces/CintraAI/code-chunker)
29
 
30
  ## Getting Started
31
 
32
  ### Prerequisites
33
 
34
  - Python 3.8+
 
35
 
36
  ### Installation
37
 
38
  1. Clone the repository:
39
  ```sh
40
+ git clone https://github.com/yourgithubusername/code-chunker.git
41
  ```
42
 
43
  2. Navigate to the project directory
44
  ```sh
45
+ cd code-chunker
46
  ```
47
  3. Install the required dependencies
48
  ```sh
 
53
  Use the CodeChunker class to chunk a specific code file. You can specify the file extension and token limit for chunking.
54
  Example:
55
  ```py
 
 
56
  chunker = CodeChunker(file_extension='py', encoding_name='gpt-4')
57
  chunks = chunker.chunk(your_code_here, token_limit=1000)
58
  CodeChunker.print_chunks(chunks)
 
62
  The CodeParser class allows you to parse code to identify points of interest and comments, which can then be used for chunking or other analysis.
63
  Example:
64
  ```
 
 
65
  parser = CodeParser(['py'])
66
  tree = parser.parse_code(your_code_here, 'py')
67
  points_of_interest = parser.extract_points_of_interest(tree, 'py')
68
  ```
69
 
70
+ 3. Understanding the Token Limit in Chunking:
71
+
72
+ In the `chunk` method of the `Chunker` class, a `token_limit` parameter is used to control the size of each chunk of code. A 'token' is the unit of text produced by the tokenizer; this project counts tokens with tiktoken using the encoding of the configured model (`gpt-4` by default). A token is typically a short word, a fragment of a longer word, or a punctuation mark.
73
+
74
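+ The `token_limit` parameter sets the target number of these tokens for each chunk. If the limit is, for instance, 100 tokens, the `chunk` method aims to produce chunks of roughly 100 tokens each. It is a target rather than a hard cap: chunks are split at logical breakpoints in the code, so an individual chunk may be somewhat smaller or larger than the limit.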
75
+
76
+ It is worth noting that the way content is tokenized and how a token is defined depends on the specific implementation and the type of content being processed.
77
+
78
  ## Contributing
79
  We welcome contributions from the community, whether it's through reporting bugs, submitting feature requests, or sending pull requests. Please check the CONTRIBUTING.md file for more details on how to contribute to the project.
80
 
81
  ## License
82
+ This project is licensed under the MIT license. See the LICENSE file for details.
83
 
84
  ## Acknowledgments
85
  - This project utilizes the tree-sitter project for parsing code.
86
  - This also uses tiktoken to count tokens for determining chunk sizes.
87
 
 
__pycache__/Chunker.cpython-312.pyc CHANGED
Binary files a/__pycache__/Chunker.cpython-312.pyc and b/__pycache__/Chunker.cpython-312.pyc differ
 
__pycache__/test_code_chunker.cpython-312.pyc CHANGED
Binary files a/__pycache__/test_code_chunker.cpython-312.pyc and b/__pycache__/test_code_chunker.cpython-312.pyc differ
 
__pycache__/utils.cpython-312.pyc CHANGED
Binary files a/__pycache__/utils.cpython-312.pyc and b/__pycache__/utils.cpython-312.pyc differ
 
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import json
3
+ import os
4
+ from Chunker import CodeChunker
5
+
6
+ # Set Streamlit page config at the very beginning
7
+ st.set_page_config(page_title="Cintra Code Chunker", layout="wide")
8
+
9
+ # Function to load JSON data
10
def load_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The deserialized JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; pin the encoding so the result does
    # not depend on the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
13
+
14
+ # Function to read code from an uploaded file
15
def read_code_from_file(uploaded_file):
    """Return the text content of an uploaded code file.

    Args:
        uploaded_file: A file-like object exposing ``getvalue()`` that
            returns the raw bytes of the upload.

    Returns:
        The decoded text of the file.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading byte-order
    # mark, so files saved by BOM-writing editors do not start with "\ufeff".
    return uploaded_file.getvalue().decode("utf-8-sig")
17
+
18
+ st.link_button('Contribute on GitHub', 'https://github.com/CintraAI/code-chunker', help=None, type="secondary", disabled=False, use_container_width=False)
19
+
20
+ json_file_path = os.path.join(os.path.dirname(__file__), 'mock_codefiles.json')
21
+ code_files_data = load_json_file(json_file_path)
22
+
23
+ # Extract filenames and contents
24
+ code_files = list(code_files_data.keys())
25
+
26
+ st.title('Cintra Code Chunker')
27
+
28
+ selection_col, upload_col = st.columns(2)
29
+ with selection_col:
30
+ # File selection dropdown
31
+ selected_file_name = st.selectbox("Select an example code file", code_files)
32
+
33
+ with upload_col:
34
+ # File upload
35
+ uploaded_file = st.file_uploader("Or upload your code file", type=['py', 'js', 'css', 'jsx'])
36
+
37
+ # Determine the content and file extension based on selection or upload
38
+ if uploaded_file is not None:
39
+ code_content = read_code_from_file(uploaded_file)
40
+ file_extension = uploaded_file.name.split('.')[-1]
41
+ else:
42
+ code_content = code_files_data.get(selected_file_name, "")
43
+ file_extension = selected_file_name.split('.')[-1] if selected_file_name else None
44
+
45
+ # Determine the language for syntax highlighting
46
def get_language_by_extension(file_extension):
    """Map a file extension to the language name used for syntax highlighting.

    The lookup is case-insensitive, so ``'PY'`` and ``'py'`` are equivalent.

    Args:
        file_extension: Extension without the leading dot (e.g. ``'py'``),
            or ``None`` when no file is selected.

    Returns:
        ``'python'``, ``'javascript'`` or ``'css'`` for a recognised
        extension; ``None`` for anything else, including non-string input.
    """
    # Anything that is not a string (notably None) has no known language.
    if not isinstance(file_extension, str):
        return None

    languages = {
        'py': 'python',
        'python': 'python',
        'js': 'javascript',
        'jsx': 'javascript',
        'javascript': 'javascript',
        'css': 'css',
    }
    return languages.get(file_extension.lower())
55
+
56
+ language = get_language_by_extension(file_extension)
57
+
58
+ st.write("""
59
+ ### Choose Chunk Size Target""")
60
+ token_chunk_size = st.number_input('Target Chunk Size Target', min_value=5, max_value=1000, value=25, help="The token limit guides the chunk size in tokens (tiktoken, gpt-4), aiming for readability without enforcing a strict upper limit.")
61
+
62
+ with st.expander("Learn more about the chunk size target"):
63
+ st.markdown("""
64
+ The `token_limit` parameter in the `chunk` function serves as a guideline to optimize the size of code chunks produced. It is not a hard limit but rather an ideal target, attempting to achieve a balance between chunk size and maintaining logical coherence within the code.
65
+
66
+ - **Adherence to Logical Breakpoints:** The chunking logic respects logical breakpoints in the code, ensuring that chunks are coherent and maintain readability.
67
+ - **Flexibility in Chunk Size:** Chunks might be slightly smaller or larger than the specified `token_limit` to avoid breaking the code in the middle of logical sections.
68
+ - **Handling Final Chunks:** The last chunk of code captures any remaining code, which may vary significantly in size depending on the remaining code's structure.
69
+
70
+ This approach allows for flexibility in how code is segmented into chunks, emphasizing the balance between readable, logical code segments and size constraints.
71
+ """)
72
+
73
+ original_col, chunked_col = st.columns(2)
74
+
75
+ with original_col:
76
+ st.subheader('Original File')
77
+ st.code(code_content, language=language)
78
+
79
+ # Initialize the code chunker
80
+ code_chunker = CodeChunker(file_extension=file_extension)
81
+
82
+ # Chunk the code content
83
+ chunked_code_dict = code_chunker.chunk(code_content, token_chunk_size)
84
+
85
+ with chunked_col:
86
+ st.subheader('Chunked Code')
87
+ for chunk_key, chunk_code in chunked_code_dict.items():
88
+ st.code(chunk_code, language=language)
cintra/documentation.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "Chunker.py": "Description: This file provides functionality for chunking code into manageable segments based on token limits and breakpoints.\n\nDependencies: CodeParser, Utils\n\nFunctions:\n- Name: chunk\n Description: Divides code into chunks based on token limits and breakpoints.\n Parameters: content (str) - The code content to be chunked, token_limit (int) - The target number of tokens per chunk (a guideline, not a hard maximum).\n Returns: dict - A dictionary containing the chunked code segments.\n\n- Name: get_chunk\n Description: Retrieves a specific chunk from the chunked codebase.\n Parameters: chunked_content (dict) - The dictionary of chunked code segments, chunk_number (int) - The number of the desired chunk.\n Returns: str - The code segment corresponding to the specified chunk number.\n\n- Name: print_chunks\n Description: Prints out the chunked code segments with a header for each chunk.\n Parameters: chunks (dict) - The dictionary of chunked code segments.\n Returns: None\n\n- Name: consolidate_chunks_into_file\n Description: Combines all chunked code segments into a single string for output.\n Parameters: chunks (dict) - The dictionary of chunked code segments.\n Returns: str - The consolidated code segments as a single string.\n\n- Name: count_lines\n Description: Counts the number of lines in the consolidated code segments.\n Parameters: consolidated_chunks (str) - The consolidated code segments as a single string.\n Returns: int - The total number of lines in the consolidated code.",
3
+ "CodeParser.py": "Description: This file serves as a code parser for various programming languages, allowing for the extraction of points of interest and comments from code files.\n\nDependencies: os, subprocess, typing, tree_sitter, logging\n\nFunctions:\n- Name: \\_\\_init\\_\\_\n Description: Initializes the CodeParser object with the specified file extensions.\n Parameters: file_extensions (Union[None, List[str], str]) - The file extensions to parse.\n Returns: None\n\n- Name: parse_code\n Description: Parses the provided code based on the file extension.\n Parameters: code (str) - The code to parse, file_extension (str) - The extension of the code file.\n Returns: Union[None, Node] - The root node of the parsed code.\n\n- Name: extract_points_of_interest\n Description: Recursively extracts points of interest from the parsed code.\n Parameters: node (Node) - The current node being processed, file_extension (str) - The extension of the code file.\n Returns: List[Tuple[Node, str]] - A list of tuples containing nodes of interest and their types.\n\n- Name: extract_comments\n Description: Recursively extracts comments from the parsed code.\n Parameters: node (Node) - The current node being processed, file_extension (str) - The extension of the code file.\n Returns: List[Tuple[Node, str]] - A list of tuples containing comments and their types.\n\n- Name: get_lines_for_points_of_interest\n Description: Retrieves the line numbers of points of interest in the code.\n Parameters: code (str) - The code to analyze, file_extension (str) - The extension of the code file.\n Returns: List[int] - A list of line numbers with points of interest.\n\n- Name: get_lines_for_comments\n Description: Retrieves the line numbers of comments in the code.\n Parameters: code (str) - The code to analyze, file_extension (str) - The extension of the code file.\n Returns: List[int] - A list of line numbers with comments.\n\n- Name: print_all_line_types\n Description: Prints the line numbers 
with their corresponding node types.\n Parameters: code (str) - The code to analyze, file_extension (str) - The extension of the code file.\n Returns: None\n\n- Name: map_line_to_node_type\n Description: Maps line numbers to node types recursively.\n Parameters: node (Node) - The current node being processed, line_to_node_type (Dict) - Mapping of line numbers to node types, depth (int) - The depth of recursion.\n Returns: Dict[int, List[str]] - A dictionary mapping line numbers to node types.\n\n- Name: print_simple_line_numbers_with_code\n Description: Prints the line numbers with their corresponding code lines.\n Parameters: code (str) - The code to display.\n Returns: None",
4
+ "app.py": "Description: This file uses the Streamlit library to create a user interface for the code chunker: the user picks a target chunk size in tokens, and the original code is displayed side by side with its chunked output.\n\nDependencies: streamlit\n\nFunctions:\n- Name: slider\n Description: Creates a slider widget for selecting a numerical value.\n Parameters: None\n Returns: The selected value (int or float)\n\n- Name: write\n Description: Writes the provided values to the user interface.\n Parameters: values (any) - The values to be displayed.\n Returns: None",
5
+ "test_code_chunker.py": "Description: This file contains unit tests for the CodeChunker class, which is responsible for chunking code files based on token limits.\n\nDependencies: unittest, unittest.mock, Chunker, CodeChunker, load_json, tiktoken, json, os\n\nFunctions:\n- Name: mock_count_tokens\n Description: Mocks the count_tokens function to return the number of tokens in a text string.\n Parameters: string (str) - The text string to count tokens for, encoding_name (str) - The name of the encoding model (default: 'gpt-4').\n Returns: int - The number of tokens in the text string.\n\n- Name: TestCodeChunkerPython\n Description: Python test class for testing code chunking functionality.\n Parameters: N/A\n Returns: N/A\n\n- Name: TestCodeChunkerJavaScript\n Description: JavaScript test class for testing code chunking functionality.\n Parameters: N/A\n Returns: N/A\n\n- Name: TestCodeChunkerCSS\n Description: CSS test class for testing code chunking functionality.\n Parameters: N/A\n Returns: N/A\n\nEach test method within the test classes follows a similar structure:\n- Description: Tests chunking code with specific characteristics (e.g., simple code, routes, models, main, utilities, big class, react component, media query, simple styles).\n- Parameters: py_code/js_code/css_code (str) - The code to be chunked, token_limit (int) - The limit of tokens per chunk.\n- Returns: N/A",
6
+ "utils.py": "Description: This file provides functions for tokenizing text strings and loading JSON files.\n\nDependencies: tiktoken, json\n\nFunctions:\n- Name: count_tokens\n Description: Returns the number of tokens in a text string.\n Parameters: string (str) - The text string to tokenize, encoding_name (str) - The name of the encoding model to use.\n Returns: int - The number of tokens in the text string.\n\n- Name: load_json\n Description: Loads and parses a JSON file.\n Parameters: json_file (str) - The path to the JSON file to load.\n Returns: dict - The parsed JSON data from the file."
7
+ }
mock_codefiles.json CHANGED
@@ -1,27 +1,30 @@
1
  {
2
- "simple.py": "import sys\n\n# This is a sample Python file\n\ndef main():\n print('Hello, world!')\n\nif __name__ == '__main__':\n main()",
3
- "text_only.py": "# This file is empty and should test the chunker's ability to handle empty files\n",
4
- "routes.py": "from flask import Flask, jsonify, request, redirect, url_for\napp = Flask(__name__)\n\n@app.route('/', methods=['GET'])\ndef home():\n return '<h1>Welcome to the Home Page</h1>', 200\n\n@authenticate # Hypothetical decorator for authentication\n@log_access # Hypothetical decorator for logging access\n@app.route('/api/data', methods=['GET'])\ndef get_data():\n # Simulate fetching data from a database or external service\n data = {'key': 'This is some data'}\n return jsonify(data), 200\n\n@app.route('/api/data/<int:data_id>', methods=['GET'])\ndef get_data_by_id(data_id):\n # Simulate fetching specific data by ID\n data = {'id': data_id, 'value': 'Specific data based on ID'}\n return jsonify(data), 200\n\n@app.route('/api/data', methods=['POST'])\ndef post_data():\n data = request.json\n # Simulate saving data to a database\n return jsonify({'message': 'Data saved successfully', 'data': data}), 201\n\n@app.route('/api/data/<int:data_id>', methods=['PUT'])\ndef update_data(data_id):\n data = request.json\n # Simulate updating data in a database\n return jsonify({'message': 'Data updated successfully', 'id': data_id, 'data': data}), 200\n\n@app.route('/api/data/<int:data_id>', methods=['DELETE'])\ndef delete_data(data_id):\n # Simulate deleting data by ID\n return jsonify({'message': 'Data deleted successfully', 'id': data_id}), 200\n\n@app.route('/redirect', methods=['GET'])\ndef example_redirect():\n return redirect(url_for('home'))\n\nif __name__ == '__main__':\n app.run(debug=True)",
5
- "models.py": "from sqlalchemy import Column, Integer, String, ForeignKey\nfrom sqlalchemy.ext.declarative import declarative_base\nfrom sqlalchemy.orm import relationship\n\nBase = declarative_base()\n\nclass User(Base):\n __tablename__ = 'users'\n id = Column(Integer, primary_key=True)\n username = Column(String, unique=True, nullable=False)\n email = Column(String, unique=True, nullable=False)\n\n posts = relationship('Post', backref='author')\n\nclass Post(Base):\n __tablename__ = 'posts'\n id = Column(Integer, primary_key=True)\n title = Column(String, nullable=False)\n content = Column(String, nullable=False)\n user_id = Column(Integer, ForeignKey('users.id'))",
6
- "big_class.py": "class BigClass:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n\n def get_name(self):\n return self.name\n\n def get_age(self):\n return self.age\n\n def set_name(self, name):\n self.name = name\n\n def set_age(self, age):\n self.age = age\n\n def __str__(self):\n return f'Name: {self.name}, Age: {self.age}'",
7
- "main.py": "from flask import Flask\nfrom routes import app as routes_app\n\n# Create the Flask application\napp = Flask(__name__)\n\n# Register the routes from the routes.py file\napp.register_blueprint(routes_app)\n\n# Configuration settings for the app can go here\napp.config['DEBUG'] = True\napp.config['SECRET_KEY'] = 'your_secret_key'\n\n# More complex app initialization steps can be added here\n# For example, database initialization, login manager setups, etc.\n\n# This function can be used to create a database schema\n def create_database(app):\n if not path.exists('yourdatabase.db'):\n db.create_all(app=app)\n print('Created Database!')\n\nif __name__ == '__main__':\n # Optionally, call database creation or other setup functions here\n create_database(app)\n app.run()",
8
- "utilities.py": "import hashlib\nimport uuid\nimport re\nfrom datetime import datetime, timedelta\n\n# Function to hash a password\ndef hash_password(password):\n salt = uuid.uuid4().hex\n return hashlib.sha256(salt.encode() + password.encode()).hexdigest() + ':' + salt\n\n# Function to check a hashed password\ndef check_password(hashed_password, user_password):\n password, salt = hashed_password.split(':')\n return password == hashlib.sha256(salt.encode() + user_password.encode()).hexdigest()\n\n# Function to validate an email address\ndef validate_email(email):\n pattern = r\"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$\"\n return re.match(pattern, email) is not None\n\n# Function to generate a token expiration date\ndef generate_expiration_date(days=1):\n return datetime.now() + timedelta(days=days)\n\n# Function to convert a string to datetime\ndef string_to_datetime(date_string, format='%Y-%m-%d %H:%M:%S'):\n return datetime.strptime(date_string, format)\n\n# Function to convert datetime to string\ndef datetime_to_string(date, format='%Y-%m-%d %H:%M:%S'):\n return date.strftime(format)",
9
- "services.py": "import requests\nfrom flask_sqlalchemy import SQLAlchemy\n\n# Assuming an initialized Flask app with SQLAlchemy\ndb = SQLAlchemy()\n\n# Example of a model for database operations\nclass UserData(db.Model):\n id = db.Column(db.Integer, primary_key=True)\n name = db.Column(db.String(100), nullable=False)\n email = db.Column(db.String(100), unique=True, nullable=False)\n\n def __repr__(self):\n return f'<User {self.name}>'\n\n# Function to fetch data from an external API\ndef fetch_external_data(api_url):\n response = requests.get(api_url)\n if response.status_code == 200:\n return response.json()\n else:\n return {'error': 'Failed to fetch data'}\n\n# Function to save user data to the database\ndef save_user_data(name, email):\n new_user = UserData(name=name, email=email)\n db.session.add(new_user)\n try:\n db.session.commit()\n return {'message': 'User saved successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to update user data in the database\ndef update_user_data(user_id, name=None, email=None):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n if name:\n user.name = name\n if email:\n user.email = email\n try:\n db.session.commit()\n return {'message': 'User updated successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to delete user data from the database\ndef delete_user_data(user_id):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n try:\n db.session.delete(user)\n db.session.commit()\n return {'message': 'User deleted successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}",
10
- "simple.js": "// This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n}\n\nmain();",
11
- "text_only.js": "// This file is empty and should test the chunker's ability to handle empty files\n",
12
- "routes.js": "const express = require('express');\nconst router = express.Router();\n\n// Example of a simple route\nrouter.get('/', (req, res) => {\n res.send('Welcome to the Home Page');\n});\n\n// Example of a route with a parameter\nrouter.get('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ id: dataId, value: 'Specific data based on ID' });\n});\n\n// Example of a route that handles POST requests\nrouter.post('/api/data', (req, res) => {\n const data = req.body;\n res.status(201).json({ message: 'Data saved successfully', data: data });\n});\n\n// Example of a route that handles PUT requests\nrouter.put('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n const data = req.body;\n res.json({ message: 'Data updated successfully', id: dataId, data: data });\n});\n\n// Example of a route that handles DELETE requests\nrouter.delete('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ message: 'Data deleted successfully', id: dataId });\n});\n\nmodule.exports = router;",
13
- "models.js": "const mongoose = require('mongoose');\n\n// Example of a simple Mongoose model\nconst userSchema = new mongoose.Schema({\n username: { type: String, required: true, unique: true },\n email: { type: String, required: true, unique: true }\n});\n\nconst User = mongoose.model('User', userSchema);\n\nmodule.exports = User;",
14
- "big_class.js": "// This is a sample JavaScript class with a large number of methods\n\nclass BigClass {\n constructor(name, age) {\n this.name = name;\n this.age = age;\n }\n\n getName() {\n return this.name;\n }\n\n getAge() {\n return this.age;\n }\n\n setName(name) {\n this.name = name;\n }\n\n setAge(age) {\n this.age = age;\n }\n\n toString() {\n return `Name: ${this.name}, Age: ${this.age}`;\n }\n}\n\nmodule.exports = BigClass;",
15
- "main.js": "const express = require('express');\nconst routes = require('./routes');\n\n// Create the Express application\nconst app = express();\n\n// Register the routes from the routes.js file\napp.use('/', routes);\n\n// Configuration settings for the app can go here\napp.set('port', process.env.PORT || 3000);\n\n// More complex app initialization steps can be added here\n// For example, database initialization, middleware setups, etc.\n\n// This function can be used to create a database schema\nfunction createDatabase() {\n // Code to create database schema\n console.log('Created Database!');\n}\n\n// Optionally, call database creation or other setup functions here\ncreateDatabase();\n\n// Start the server\napp.listen(app.get('port'), () => {\n console.log(`Server running on port ${app.get('port')}`);\n});",
16
- "utilities.js": "// Example of utility functions for common tasks\n\n// Function to hash a password\nfunction hashPassword(password) {\n const salt = uuidv4();\n return sha256(salt + password) + ':' + salt;\n}\n\n// Function to check a hashed password\nfunction checkPassword(hashedPassword, userPassword) {\n const [password, salt] = hashedPassword.split(':');\n return sha256(salt + userPassword) === password;\n}\n\n// Function to validate an email address\nfunction validateEmail(email) {\n const pattern = /^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$/;\n return pattern.test(email);\n}\n\n// Function to generate a token expiration date\nfunction generateExpirationDate(days = 1) {\n const expirationDate = new Date();\n expirationDate.setDate(expirationDate.getDate() + days);\n return expirationDate;\n}\n\n// Function to convert a string to a Date object\nfunction stringToDate(dateString, format = 'YYYY-MM-DD HH:mm:ss') {\n return new Date(dateString);\n}\n\n// Function to convert a Date object to a string\nfunction dateToString(date, format = 'YYYY-MM-DD HH:mm:ss') {\n return date.toISOString();\n}",
17
- "services.js": "// Example of service functions for handling data operations\n\n// Function to fetch data from an external API\nasync function fetchExternalData(apiUrl) {\n try {\n const response = await fetch(apiUrl);\n if (response.ok) {\n return await response.json();\n } else {\n return { error: 'Failed to fetch data' };\n }\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to save user data to the database\nasync function saveUserData(name, email) {\n try {\n const newUser = new UserData({ name, email });\n await newUser.save();\n return { message: 'User saved successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to update user data in the database\nasync function updateUserData(userId, name, email) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n if (name) {\n user.name = name;\n }\n if (email) {\n user.email = email;\n }\n await user.save();\n return { message: 'User updated successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to delete user data from the database\nasync function deleteUserData(userId) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n await user.remove();\n return { message: 'User deleted successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}",
18
- "react_component.js": "import React from 'react';\n\nimport './SearchResults.css';\n\nimport TrackList from '../TrackList/TrackList.js';\n\n constructor(props) {\n super(props);\n this.addTopFive = this.addTopFive.bind(this);\n this.addTopTen = this.addTopTen.bind(this);\n this.addAll = this.addAll.bind(this);\n }\n\n //add the top five tracks to the playlist\n addTopFive() {\n this.props.onAdd(this.props.searchResults.slice(0, 5));\n }\n\n //add top 10 tracks to the playlist\n addTopTen() {\n this.props.onAdd(this.props.searchResults.slice(0, 10));\n }\n\n addAll() {\n this.props.onAdd(this.props.searchResults);\n }\n render() {\n return (\n <div className=\"SearchResults\">\n <h2>Results</h2>\n <TrackList tracks={this.props.searchResults} onAdd={this.props.onAdd} onToggle={this.props.onToggle} currentTrack={this.props.currentTrack}/>\n </div>\n );\n }\n}\n\nexport default SearchResults;'",
19
- "simple_styles.css": "/* Example of CSS styles for a web page */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}",
20
- "media_queries.css": "/* Example of CSS styles with media queries for responsive design */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}",
21
- "single_syntax_error_example.py": "# This is a sample Python file\n\nprint('Hello, world!'\n\n",
22
- "multiple_syntax_errors.py": "def calculate_sum(lst):\n total = 0\n for num in lst\n total += num\n return total\n\nprint(calculate_sum([1, 2, 3, 4])\n\ndef string_manipulator(s):\n new_string = ''\n for char in s:\n if char == 'a':\n new_string += 'z'\n else:\n new_string += char\n return new_string\n\nprint(string_manipulate('banana'))\n\ndef find_max(numbers):\n max_num = numbers[0]\n for num in numbers\n if num > max_num\n max_num = num\n return max_num\n\nprint(find_max([1, 2, 3, 4, 5])",
23
- "single_syntax_error_example.js": "//This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n if (true) {\n console.log('hi');\n \n}\n\nmain();",
24
- "multiple_syntax_errors.js": "function calculateSum(arr) {\n let total = 0;\n for (let i = 0; i < arr.length; i++ {\n total += arr[i];\n }\n return total;\n}\n\nconsole.log(calculateSum([1, 2, 3, 4);\n\nfunction stringManipulator(str) {\n let newString = '';\n for (let i = 0; i < str.length; i++) {\n if (str.charAt(i) === 'a')\n newString += 'z';\n } else {\n newString += str.charAt(i);\n }\n }\n return newString;\n}\n\nconsole.log(stringManipulator('banana'));\n\nfunction findMax(numbers) {\n let maxNum = numbers[0];\n for (let i = 1; i < numbers.length; i++) {\n if (numbers[i] > maxNum) {\n maxNum = numbers[i];\n }\n }\n return maxNum;\n}\n\nconsole.log(findMax([1, 2, 3, 4, 5]);",
25
- "single_syntax_error_example.css": "\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n",
26
- "multiple_syntax_errors.css": "body {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}"
27
- }
 
 
 
 
1
  {
2
+ "simple.py": "import sys\n\n# This is a sample Python file\n\ndef main():\n print('Hello, world!')\n\nif __name__ == '__main__':\n main()",
3
+ "text_only.py": "# This file is empty and should test the chunker's ability to handle empty files\n",
4
+ "routes.py": "from flask import Flask, jsonify, request, redirect, url_for\napp = Flask(__name__)\n\n@app.route('/', methods=['GET'])\ndef home():\n return '<h1>Welcome to the Home Page</h1>', 200\n\n@authenticate # Hypothetical decorator for authentication\n@log_access # Hypothetical decorator for logging access\n@app.route('/api/data', methods=['GET'])\ndef get_data():\n # Simulate fetching data from a database or external service\n data = {'key': 'This is some data'}\n return jsonify(data), 200\n\n@app.route('/api/data/<int:data_id>', methods=['GET'])\ndef get_data_by_id(data_id):\n # Simulate fetching specific data by ID\n data = {'id': data_id, 'value': 'Specific data based on ID'}\n return jsonify(data), 200\n\n@app.route('/api/data', methods=['POST'])\ndef post_data():\n data = request.json\n # Simulate saving data to a database\n return jsonify({'message': 'Data saved successfully', 'data': data}), 201\n\n@app.route('/api/data/<int:data_id>', methods=['PUT'])\ndef update_data(data_id):\n data = request.json\n # Simulate updating data in a database\n return jsonify({'message': 'Data updated successfully', 'id': data_id, 'data': data}), 200\n\n@app.route('/api/data/<int:data_id>', methods=['DELETE'])\ndef delete_data(data_id):\n # Simulate deleting data by ID\n return jsonify({'message': 'Data deleted successfully', 'id': data_id}), 200\n\n@app.route('/redirect', methods=['GET'])\ndef example_redirect():\n return redirect(url_for('home'))\n\nif __name__ == '__main__':\n app.run(debug=True)",
5
+ "models.py": "from sqlalchemy import Column, Integer, String, ForeignKey\nfrom sqlalchemy.ext.declarative import declarative_base\nfrom sqlalchemy.orm import relationship\n\nBase = declarative_base()\n\nclass User(Base):\n __tablename__ = 'users'\n id = Column(Integer, primary_key=True)\n username = Column(String, unique=True, nullable=False)\n email = Column(String, unique=True, nullable=False)\n\n posts = relationship('Post', backref='author')\n\nclass Post(Base):\n __tablename__ = 'posts'\n id = Column(Integer, primary_key=True)\n title = Column(String, nullable=False)\n content = Column(String, nullable=False)\n user_id = Column(Integer, ForeignKey('users.id'))",
6
+ "big_class.py": "class BigClass:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n\n def get_name(self):\n return self.name\n\n def get_age(self):\n return self.age\n\n def set_name(self, name):\n self.name = name\n\n def set_age(self, age):\n self.age = age\n\n def __str__(self):\n return f'Name: {self.name}, Age: {self.age}'",
7
+ "main.py": "from flask import Flask\nfrom routes import app as routes_app\n\n# Create the Flask application\napp = Flask(__name__)\n\n# Register the routes from the routes.py file\napp.register_blueprint(routes_app)\n\n# Configuration settings for the app can go here\napp.config['DEBUG'] = True\napp.config['SECRET_KEY'] = 'your_secret_key'\n\n# More complex app initialization steps can be added here\n# For example, database initialization, login manager setups, etc.\n\n# This function can be used to create a database schema\n def create_database(app):\n if not path.exists('yourdatabase.db'):\n db.create_all(app=app)\n print('Created Database!')\n\nif __name__ == '__main__':\n # Optionally, call database creation or other setup functions here\n create_database(app)\n app.run()",
8
+ "utilities.py": "import hashlib\nimport uuid\nimport re\nfrom datetime import datetime, timedelta\n\n# Function to hash a password\ndef hash_password(password):\n salt = uuid.uuid4().hex\n return hashlib.sha256(salt.encode() + password.encode()).hexdigest() + ':' + salt\n\n# Function to check a hashed password\ndef check_password(hashed_password, user_password):\n password, salt = hashed_password.split(':')\n return password == hashlib.sha256(salt.encode() + user_password.encode()).hexdigest()\n\n# Function to validate an email address\ndef validate_email(email):\n pattern = r\"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$\"\n return re.match(pattern, email) is not None\n\n# Function to generate a token expiration date\ndef generate_expiration_date(days=1):\n return datetime.now() + timedelta(days=days)\n\n# Function to convert a string to datetime\ndef string_to_datetime(date_string, format='%Y-%m-%d %H:%M:%S'):\n return datetime.strptime(date_string, format)\n\n# Function to convert datetime to string\ndef datetime_to_string(date, format='%Y-%m-%d %H:%M:%S'):\n return date.strftime(format)",
9
+ "services.py": "import requests\nfrom flask_sqlalchemy import SQLAlchemy\n\n# Assuming an initialized Flask app with SQLAlchemy\ndb = SQLAlchemy()\n\n# Example of a model for database operations\nclass UserData(db.Model):\n id = db.Column(db.Integer, primary_key=True)\n name = db.Column(db.String(100), nullable=False)\n email = db.Column(db.String(100), unique=True, nullable=False)\n\n def __repr__(self):\n return f'<User {self.name}>'\n\n# Function to fetch data from an external API\ndef fetch_external_data(api_url):\n response = requests.get(api_url)\n if response.status_code == 200:\n return response.json()\n else:\n return {'error': 'Failed to fetch data'}\n\n# Function to save user data to the database\ndef save_user_data(name, email):\n new_user = UserData(name=name, email=email)\n db.session.add(new_user)\n try:\n db.session.commit()\n return {'message': 'User saved successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to update user data in the database\ndef update_user_data(user_id, name=None, email=None):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n if name:\n user.name = name\n if email:\n user.email = email\n try:\n db.session.commit()\n return {'message': 'User updated successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}\n\n# Function to delete user data from the database\ndef delete_user_data(user_id):\n user = UserData.query.get(user_id)\n if not user:\n return {'error': 'User not found'}\n try:\n db.session.delete(user)\n db.session.commit()\n return {'message': 'User deleted successfully'}\n except Exception as e:\n db.session.rollback()\n return {'error': str(e)}",
10
+ "simple.js": "// This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n}\n\nmain();",
11
+ "text_only.js": "// This file is empty and should test the chunker's ability to handle empty files\n",
12
+ "routes.js": "const express = require('express');\nconst router = express.Router();\n\n// Example of a simple route\nrouter.get('/', (req, res) => {\n res.send('Welcome to the Home Page');\n});\n\n// Example of a route with a parameter\nrouter.get('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ id: dataId, value: 'Specific data based on ID' });\n});\n\n// Example of a route that handles POST requests\nrouter.post('/api/data', (req, res) => {\n const data = req.body;\n res.status(201).json({ message: 'Data saved successfully', data: data });\n});\n\n// Example of a route that handles PUT requests\nrouter.put('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n const data = req.body;\n res.json({ message: 'Data updated successfully', id: dataId, data: data });\n});\n\n// Example of a route that handles DELETE requests\nrouter.delete('/api/data/:data_id', (req, res) => {\n const dataId = req.params.data_id;\n res.json({ message: 'Data deleted successfully', id: dataId });\n});\n\nmodule.exports = router;",
13
+ "models.js": "const mongoose = require('mongoose');\n\n// Example of a simple Mongoose model\nconst userSchema = new mongoose.Schema({\n username: { type: String, required: true, unique: true },\n email: { type: String, required: true, unique: true }\n});\n\nconst User = mongoose.model('User', userSchema);\n\nmodule.exports = User;",
14
+ "big_class.js": "// This is a sample JavaScript class with a large number of methods\n\nclass BigClass {\n constructor(name, age) {\n this.name = name;\n this.age = age;\n }\n\n getName() {\n return this.name;\n }\n\n getAge() {\n return this.age;\n }\n\n setName(name) {\n this.name = name;\n }\n\n setAge(age) {\n this.age = age;\n }\n\n toString() {\n return `Name: ${this.name}, Age: ${this.age}`;\n }\n}\n\nmodule.exports = BigClass;",
15
+ "main.js": "const express = require('express');\nconst routes = require('./routes');\n\n// Create the Express application\nconst app = express();\n\n// Register the routes from the routes.js file\napp.use('/', routes);\n\n// Configuration settings for the app can go here\napp.set('port', process.env.PORT || 3000);\n\n// More complex app initialization steps can be added here\n// For example, database initialization, middleware setups, etc.\n\n// This function can be used to create a database schema\nfunction createDatabase() {\n // Code to create database schema\n console.log('Created Database!');\n}\n\n// Optionally, call database creation or other setup functions here\ncreateDatabase();\n\n// Start the server\napp.listen(app.get('port'), () => {\n console.log(`Server running on port ${app.get('port')}`);\n});",
16
+ "utilities.js": "// Example of utility functions for common tasks\n\n// Function to hash a password\nfunction hashPassword(password) {\n const salt = uuidv4();\n return sha256(salt + password) + ':' + salt;\n}\n\n// Function to check a hashed password\nfunction checkPassword(hashedPassword, userPassword) {\n const [password, salt] = hashedPassword.split(':');\n return sha256(salt + userPassword) === password;\n}\n\n// Function to validate an email address\nfunction validateEmail(email) {\n const pattern = /^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$/;\n return pattern.test(email);\n}\n\n// Function to generate a token expiration date\nfunction generateExpirationDate(days = 1) {\n const expirationDate = new Date();\n expirationDate.setDate(expirationDate.getDate() + days);\n return expirationDate;\n}\n\n// Function to convert a string to a Date object\nfunction stringToDate(dateString, format = 'YYYY-MM-DD HH:mm:ss') {\n return new Date(dateString);\n}\n\n// Function to convert a Date object to a string\nfunction dateToString(date, format = 'YYYY-MM-DD HH:mm:ss') {\n return date.toISOString();\n}",
17
+ "services.js": "// Example of service functions for handling data operations\n\n// Function to fetch data from an external API\nasync function fetchExternalData(apiUrl) {\n try {\n const response = await fetch(apiUrl);\n if (response.ok) {\n return await response.json();\n } else {\n return { error: 'Failed to fetch data' };\n }\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to save user data to the database\nasync function saveUserData(name, email) {\n try {\n const newUser = new UserData({ name, email });\n await newUser.save();\n return { message: 'User saved successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to update user data in the database\nasync function updateUserData(userId, name, email) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n if (name) {\n user.name = name;\n }\n if (email) {\n user.email = email;\n }\n await user.save();\n return { message: 'User updated successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}\n\n// Function to delete user data from the database\nasync function deleteUserData(userId) {\n try {\n const user = await UserData.findById(userId);\n if (!user) {\n return { error: 'User not found' };\n }\n await user.remove();\n return { message: 'User deleted successfully' };\n } catch (error) {\n return { error: error.message };\n }\n}",
18
+ "react_component.js": "import React from 'react';\n\nimport './SearchResults.css';\n\nimport TrackList from '../TrackList/TrackList.js';\n\n constructor(props) {\n super(props);\n this.addTopFive = this.addTopFive.bind(this);\n this.addTopTen = this.addTopTen.bind(this);\n this.addAll = this.addAll.bind(this);\n }\n\n //add the top five tracks to the playlist\n addTopFive() {\n this.props.onAdd(this.props.searchResults.slice(0, 5));\n }\n\n //add top 10 tracks to the playlist\n addTopTen() {\n this.props.onAdd(this.props.searchResults.slice(0, 10));\n }\n\n addAll() {\n this.props.onAdd(this.props.searchResults);\n }\n render() {\n return (\n <div className=\"SearchResults\">\n <h2>Results</h2>\n <TrackList tracks={this.props.searchResults} onAdd={this.props.onAdd} onToggle={this.props.onToggle} currentTrack={this.props.currentTrack}/>\n </div>\n );\n }\n}\n\nexport default SearchResults;'",
19
+ "simple_styles.css": "/* Example of CSS styles for a web page */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}",
20
+ "media_queries.css": "/* Example of CSS styles with media queries for responsive design */\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n}\n\nbutton:hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}",
21
+ "single_syntax_error_example.py": "# This is a sample Python file\n\nprint('Hello, world!'\n\n",
22
+ "multiple_syntax_errors.py": "def calculate_sum(lst):\n total = 0\n for num in lst\n total += num\n return total\n\nprint(calculate_sum([1, 2, 3, 4])\n\ndef string_manipulator(s):\n new_string = ''\n for char in s:\n if char == 'a':\n new_string += 'z'\n else:\n new_string += char\n return new_string\n\nprint(string_manipulate('banana'))\n\ndef find_max(numbers):\n max_num = numbers[0]\n for num in numbers\n if num > max_num\n max_num = num\n return max_num\n\nprint(find_max([1, 2, 3, 4, 5])",
23
+ "single_syntax_error_example.js": "//This is a sample JavaScript file\n\nfunction main() {\n console.log('Hello, world!');\n if (true) {\n console.log('hi');\n \n}\n\nmain();",
24
+ "multiple_syntax_errors.js": "function calculateSum(arr) {\n let total = 0;\n for (let i = 0; i < arr.length; i++ {\n total += arr[i];\n }\n return total;\n}\n\nconsole.log(calculateSum([1, 2, 3, 4);\n\nfunction stringManipulator(str) {\n let newString = '';\n for (let i = 0; i < str.length; i++) {\n if (str.charAt(i) === 'a')\n newString += 'z';\n } else {\n newString += str.charAt(i);\n }\n }\n return newString;\n}\n\nconsole.log(stringManipulator('banana'));\n\nfunction findMax(numbers) {\n let maxNum = numbers[0];\n for (let i = 1; i < numbers.length; i++) {\n if (numbers[i] > maxNum) {\n maxNum = numbers[i];\n }\n }\n return maxNum;\n}\n\nconsole.log(findMax([1, 2, 3, 4, 5]);",
25
+ "single_syntax_error_example.css": "\n\nbody {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n",
26
+ "multiple_syntax_errors.css": "body {\n font-family: Arial, sans-serif;\n background-color: #f4f4f4;\n margin: 0;\n padding: 0;\n}\n\nh1 {\n color: #333;\n text-align: center;\n}\n\nbutton {\n padding: 10px 20px;\n font-size: 16px;\n background-color: #007bff;\n color: #fff;\n border: none;\n cursor: pointer;\n :hover {\n background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n button {\n padding: 8px 16px;\n font-size: 14px;\n }\n}",
27
+ "example.ts": "interface User {\n id: number;\n name: string;\n email: string;\n}\n\nclass UserManager {\n private users: User[] = [];\n\n addUser(user: User): void {\n this.users.push(user);\n }\n\n getUser(id: number): User | undefined {\n return this.users.find(user => user.id === id);\n }\n\n updateUser(id: number, updatedUser: Partial<User>): void {\n const userIndex = this.users.findIndex(user => user.id === id);\n if (userIndex !== -1) {\n this.users[userIndex] = { ...this.users[userIndex], ...updatedUser };\n }\n }\n\n deleteUser(id: number): void {\n this.users = this.users.filter(user => user.id !== id);\n }\n}\n\nconst userManager = new UserManager();\nuserManager.addUser({ id: 1, name: 'John Doe', email: 'john@example.com' });\nconsole.log(userManager.getUser(1));",
28
+ "example.rb": "class User\n attr_accessor :id, :name, :email\n\n def initialize(id, name, email)\n @id = id\n @name = name\n @email = email\n end\n\n def to_s\n \"User: #{@name} (#{@email})\"\n end\nend\n\nclass UserManager\n def initialize\n @users = []\n end\n\n def add_user(user)\n @users << user\n end\n\n def get_user(id)\n @users.find { |user| user.id == id }\n end\n\n def update_user(id, updated_user)\n user = get_user(id)\n user.name = updated_user.name if updated_user.name\n user.email = updated_user.email if updated_user.email\n end\n\n def delete_user(id)\n @users.delete_if { |user| user.id == id }\n end\nend\n\nuser_manager = UserManager.new\nuser_manager.add_user(User.new(1, 'John Doe', 'john@example.com'))\nputs user_manager.get_user(1)",
29
+ "example.php": "<?php\n\nclass User {\n public $id;\n public $name;\n public $email;\n\n public function __construct($id, $name, $email) {\n $this->id = $id;\n $this->name = $name;\n $this->email = $email;\n }\n}\n\nclass UserManager {\n private $users = [];\n\n public function addUser($user) {\n $this->users[] = $user;\n }\n\n public function getUser($id) {\n foreach ($this->users as $user) {\n if ($user->id === $id) {\n return $user;\n }\n }\n return null;\n }\n\n public function updateUser($id, $updatedUser) {\n foreach ($this->users as &$user) {\n if ($user->id === $id) {\n $user->name = $updatedUser->name ?? $user->name;\n $user->email = $updatedUser->email ?? $user->email;\n break;\n }\n }\n }\n\n public function deleteUser($id) {\n $this->users = array_filter($this->users, function($user) use ($id) {\n return $user->id !== $id;\n });\n }\n}\n\n$userManager = new UserManager();\n$userManager->addUser(new User(1, 'John Doe', 'john@example.com'));\nvar_dump($userManager->getUser(1));\n?>"
30
+ }
requirements.txt CHANGED
@@ -1,68 +1,46 @@
1
- aiofiles==23.2.1
2
- altair==5.2.0
3
- annotated-types==0.6.0
4
- anyio==4.3.0
5
  attrs==23.2.0
 
 
6
  certifi==2024.2.2
7
  charset-normalizer==3.3.2
8
  click==8.1.7
9
  colorama==0.4.6
10
- contourpy==1.2.0
11
- cycler==0.12.1
12
- fastapi==0.110.0
13
- ffmpy==0.3.2
14
- filelock==3.13.1
15
- fonttools==4.49.0
16
- fsspec==2024.2.0
17
- gradio==4.19.2
18
- gradio_client==0.10.1
19
- h11==0.14.0
20
- httpcore==1.0.4
21
- httpx==0.27.0
22
- huggingface-hub==0.20.3
23
  idna==3.6
24
- importlib_resources==6.1.2
25
  Jinja2==3.1.3
26
  jsonschema==4.21.1
27
  jsonschema-specifications==2023.12.1
28
- kiwisolver==1.4.5
29
  markdown-it-py==3.0.0
30
  MarkupSafe==2.1.5
31
- matplotlib==3.8.3
32
  mdurl==0.1.2
33
  numpy==1.26.4
34
- orjson==3.9.15
35
- packaging==23.2
36
  pandas==2.2.1
37
- pillow==10.2.0
38
- pydantic==2.6.2
39
- pydantic_core==2.16.3
40
- pydub==0.25.1
41
  Pygments==2.17.2
42
- pyparsing==3.1.1
43
- python-dateutil==2.8.2
44
- python-multipart==0.0.9
45
  pytz==2024.1
46
- PyYAML==6.0.1
47
- referencing==0.33.0
48
- regex==2023.12.25
49
  requests==2.31.0
50
- rich==13.7.0
51
  rpds-py==0.18.0
52
- ruff==0.2.2
53
- semantic-version==2.10.0
54
- setuptools==69.1.1
55
- shellingham==1.5.4
56
  six==1.16.0
57
- sniffio==1.3.1
58
- starlette==0.36.3
59
- tomlkit==0.12.0
 
 
 
 
60
  toolz==0.12.1
61
- tqdm==4.66.2
62
- tree-sitter==0.20.4
63
- typer==0.9.0
64
- typing_extensions==4.10.0
65
  tzdata==2024.1
66
  urllib3==2.2.1
67
- uvicorn==0.27.1
68
- websockets==11.0.3
 
1
+ altair==5.3.0
 
 
 
2
  attrs==23.2.0
3
+ blinker==1.7.0
4
+ cachetools==5.3.3
5
  certifi==2024.2.2
6
  charset-normalizer==3.3.2
7
  click==8.1.7
8
  colorama==0.4.6
9
+ gitdb==4.0.11
10
+ GitPython==3.1.43
 
 
 
 
 
 
 
 
 
 
 
11
  idna==3.6
 
12
  Jinja2==3.1.3
13
  jsonschema==4.21.1
14
  jsonschema-specifications==2023.12.1
 
15
  markdown-it-py==3.0.0
16
  MarkupSafe==2.1.5
 
17
  mdurl==0.1.2
18
  numpy==1.26.4
19
+ packaging==24.0
 
20
  pandas==2.2.1
21
+ pillow==10.3.0
22
+ protobuf==4.25.3
23
+ pyarrow==15.0.2
24
+ pydeck==0.8.1b0
25
  Pygments==2.17.2
26
+ python-dateutil==2.9.0.post0
 
 
27
  pytz==2024.1
28
+ referencing==0.34.0
 
 
29
  requests==2.31.0
30
+ rich==13.7.1
31
  rpds-py==0.18.0
 
 
 
 
32
  six==1.16.0
33
+ smmap==5.0.1
34
+ streamlit==1.33.0
35
+ tenacity==8.2.3
36
+ regex==2023.12.25
37
+ tiktoken==0.6.0
38
+ tree-sitter==0.21.3
39
+ toml==0.10.2
40
  toolz==0.12.1
41
+ tornado==6.4
42
+ typing_extensions==4.11.0
 
 
43
  tzdata==2024.1
44
  urllib3==2.2.1
45
+ watchdog==4.0.0
46
+ setuptools==69.2.0
test_code_chunker.py CHANGED
@@ -208,5 +208,75 @@ class TestCodeChunkerCSS(unittest.TestCase):
208
  self.assertEqual(num_lines, len(css_code.split("\n")))
209
  self.assertIn(css_code, final_code)
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  if __name__ == '__main__':
212
  unittest.main()
 
208
  self.assertEqual(num_lines, len(css_code.split("\n")))
209
  self.assertIn(css_code, final_code)
210
 
211
+
212
+
213
+ # TypeScript Test Class
214
class TestCodeChunkerTypeScript(unittest.TestCase):
    """Chunk the ``example.ts`` fixture and check the chunks reassemble to it."""

    def setUp(self):
        """Patch token counting, then build a ``ts`` chunker and load the fixtures."""
        self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
        self.mock_count_tokens = self.patcher.start()
        # unittest does not run tearDown when setUp raises, so the undo is
        # registered immediately after start(): a failure in the lines below
        # can no longer leave count_tokens patched for subsequent tests.
        self.addCleanup(self.patcher.stop)
        self.code_chunker = CodeChunker(file_extension='ts')
        self.mock_codebase = load_json('mock_codefiles.json')

    def test_chunk_typescript_code(self):
        """Chunk with a 20-token limit, consolidate, and compare with the source."""
        ts_code = self.mock_codebase['example.ts']
        chunks = self.code_chunker.chunk(ts_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(final_code)
        # The consolidated output must have as many lines as the source ...
        self.assertEqual(num_lines, len(ts_code.split("\n")))
        # ... and must contain the source text verbatim.
        self.assertIn(ts_code, final_code)
        self.assertGreater(len(chunks), 1)  # Ensure the code is actually chunked
235
+
236
+ # Ruby Test Class
237
class TestCodeChunkerRuby(unittest.TestCase):
    """Chunk the ``example.rb`` fixture and check the chunks reassemble to it."""

    def setUp(self):
        """Patch token counting, then build an ``rb`` chunker and load the fixtures."""
        self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
        self.mock_count_tokens = self.patcher.start()
        # unittest does not run tearDown when setUp raises, so the undo is
        # registered immediately after start(): a failure in the lines below
        # can no longer leave count_tokens patched for subsequent tests.
        self.addCleanup(self.patcher.stop)
        self.code_chunker = CodeChunker(file_extension='rb')
        self.mock_codebase = load_json('mock_codefiles.json')

    def test_chunk_ruby_code(self):
        """Chunk with a 20-token limit, consolidate, and compare with the source."""
        rb_code = self.mock_codebase['example.rb']
        chunks = self.code_chunker.chunk(rb_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(final_code)
        # The consolidated output must have as many lines as the source ...
        self.assertEqual(num_lines, len(rb_code.split("\n")))
        # ... and must contain the source text verbatim.
        self.assertIn(rb_code, final_code)
        self.assertGreater(len(chunks), 1)  # Ensure the code is actually chunked
258
+
259
+ # PHP Test Class
260
class TestCodeChunkerPHP(unittest.TestCase):
    """Chunk the ``example.php`` fixture and check the chunks reassemble to it."""

    def setUp(self):
        """Patch token counting, then build a ``php`` chunker and load the fixtures."""
        self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
        self.mock_count_tokens = self.patcher.start()
        # unittest does not run tearDown when setUp raises, so the undo is
        # registered immediately after start(): a failure in the lines below
        # can no longer leave count_tokens patched for subsequent tests.
        self.addCleanup(self.patcher.stop)
        self.code_chunker = CodeChunker(file_extension='php')
        self.mock_codebase = load_json('mock_codefiles.json')

    def test_chunk_php_code(self):
        """Chunk with a 20-token limit, consolidate, and compare with the source."""
        php_code = self.mock_codebase['example.php']
        chunks = self.code_chunker.chunk(php_code, token_limit=20)
        Chunker.print_chunks(chunks)
        final_code = Chunker.consolidate_chunks_into_file(chunks)
        num_lines = Chunker.count_lines(final_code)
        # The consolidated output must have as many lines as the source ...
        self.assertEqual(num_lines, len(php_code.split("\n")))
        # ... and must contain the source text verbatim.
        self.assertIn(php_code, final_code)
        self.assertGreater(len(chunks), 1)  # Ensure the code is actually chunked
280
+
281
  if __name__ == '__main__':
282
  unittest.main()
utils.py CHANGED
@@ -7,8 +7,6 @@ def count_tokens(string: str, encoding_name: str) -> int:
7
  num_tokens = len(encoding.encode(string))
8
  return num_tokens
9
 
10
-
11
-
12
def load_json(json_file):
    """Read a JSON file and return the decoded Python object.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The value ``json.load`` produces for the file's top-level element.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # Pin UTF-8 so decoding does not depend on the platform's locale default.
    with open(json_file, encoding="utf-8") as f:
        return json.load(f)
 
7
  num_tokens = len(encoding.encode(string))
8
  return num_tokens
9
 
 
 
10
def load_json(json_file):
    """Read a JSON file and return the decoded Python object.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The value ``json.load`` produces for the file's top-level element.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # Pin UTF-8 so decoding does not depend on the platform's locale default.
    with open(json_file, encoding="utf-8") as f:
        return json.load(f)