Spaces:

CintraAI
/

code-chunker

Running

App Files Files Community

CintraAI committed on Feb 27, 2024

Commit

a1983fb

1 Parent(s): 5b1b407

added codefiles

Browse files

Files changed (7) hide show

.vscode/settings.json +11 -0
Chunker.py +124 -0
CodeParser.py +273 -0
mock_codefiles.json +27 -0
requirements.txt +0 -0
test_code_chunker.py +214 -0
utils.py +14 -0

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+    "python.testing.unittestArgs": [
+        "-v",
+        "-s",
+        ".",
+        "-p",
+        "test_*.py"
+    ],
+    "python.testing.pytestEnabled": false,
+    "python.testing.unittestEnabled": true
+}

Chunker.py ADDED Viewed

	@@ -0,0 +1,124 @@

+from abc import ABC, abstractmethod
+from CodeParser import CodeParser
+from Utils import count_tokens
+class Chunker(ABC):
+    def __init__(self, encoding_name="gpt-4"):
+        self.encoding_name = encoding_name
+    @abstractmethod
+    def chunk(self, content, token_limit):
+        pass
+    @abstractmethod
+    def get_chunk(self, chunked_content, chunk_number):
+        pass
+    @staticmethod
+    def print_chunks(chunks):
+        for chunk_number, chunk_code in chunks.items():
+            print(f"Chunk {chunk_number}:")
+            print("="*40)
+            print(chunk_code)
+            print("="*40)
+    @staticmethod
+    def consolidate_chunks_into_file(chunks):
+        return "\n".join(chunks.values())
+    @staticmethod
+    def count_lines(consolidated_chunks):
+        lines = consolidated_chunks.split("\n")
+        return len(lines)
+class CodeChunker(Chunker):
+    def __init__(self, file_extension, encoding_name="gpt-4"):
+        super().__init__(encoding_name)
+        self.file_extension = file_extension
+    def chunk(self, code, token_limit) -> dict:
+        code_parser = CodeParser(self.file_extension)
+        chunks = {}
+        current_chunk = ""
+        token_count = 0
+        lines = code.split("\n")
+        i = 0
+        chunk_number = 1
+        start_line = 0
+        breakpoints = sorted(code_parser.get_lines_for_points_of_interest(code, self.file_extension))
+        comments = sorted(code_parser.get_lines_for_comments(code, self.file_extension))
+        adjusted_breakpoints = []
+        for bp in breakpoints:
+            current_line = bp - 1
+            highest_comment_line = None  # Initialize with None to indicate no comment line has been found yet
+            while current_line in comments:
+                highest_comment_line = current_line  # Update highest comment line found
+                current_line -= 1  # Move to the previous line
+            if highest_comment_line:  # If a highest comment line exists, add it
+                adjusted_breakpoints.append(highest_comment_line)
+            else:
+                adjusted_breakpoints.append(bp)  # If no comments were found before the breakpoint, add the original breakpoint
+        breakpoints = sorted(set(adjusted_breakpoints))  # Ensure breakpoints are unique and sorted
+        while i < len(lines):
+            line = lines[i]
+            new_token_count = count_tokens(line, self.encoding_name)
+            if token_count + new_token_count > token_limit:
+                # Set the stop line to the last breakpoint before the current line
+                if i in breakpoints:
+                    stop_line = i
+                else:
+                    stop_line = max(max([x for x in breakpoints if x < i], default=start_line), start_line)
+                # If the stop line is the same as the start line, it means we haven't reached a breakpoint yet and we need to move to the next line to find one
+                if stop_line == start_line and i not in breakpoints:
+                    token_count += new_token_count
+                    i += 1
+                # If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
+                elif stop_line == start_line and i == stop_line:
+                    token_count += new_token_count
+                    i += 1
+                # If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
+                elif stop_line == start_line and i in breakpoints:
+                    current_chunk = "\n".join(lines[start_line:stop_line])
+                    if current_chunk.strip():  # If the current chunk is not just whitespace
+                        chunks[chunk_number] = current_chunk  # Using chunk_number as key
+                        chunk_number += 1
+                    token_count = 0
+                    start_line = i
+                    i += 1
+                # If the stop line is different from the start line, it means we're at the end of a block
+                else:
+                    current_chunk = "\n".join(lines[start_line:stop_line])
+                    if current_chunk.strip():
+                        chunks[chunk_number] = current_chunk  # Using chunk_number as key
+                        chunk_number += 1
+                    i = stop_line
+                    token_count = 0
+                    start_line = stop_line
+            else:
+                # If the token count is still within the limit, add the line to the current chunk
+                token_count += new_token_count
+                i += 1
+        # Append remaining code, if any, ensuring it's not empty or whitespace
+        current_chunk_code = "\n".join(lines[start_line:])
+        if current_chunk_code.strip():  # Checks if the chunk is not just whitespace
+            chunks[chunk_number] = current_chunk_code  # Using chunk_number as key
+        return chunks
+    def get_chunk(self, chunked_codebase, chunk_number):
+        return chunked_codebase[chunk_number]

CodeParser.py ADDED Viewed

	@@ -0,0 +1,273 @@

+import os
+import subprocess
+from typing import List, Dict, Union, Tuple
+from tree_sitter import Language, Parser, Node
+from typing import Union, List
+import logging
+class CodeParser:
+    # Added a CACHE_DIR class attribute for caching
+    CACHE_DIR = os.path.expanduser("~/.code_parser_cache")
+    def __init__(self, file_extensions: Union[None, List[str], str] = None):
+        if isinstance(file_extensions, str):
+            file_extensions = [file_extensions]
+        self.language_extension_map = {
+            "py": "python",
+            "js": "javascript",
+            "jsx": "javascript",
+            "css": "css"
+        }
+        if file_extensions is None:
+            self.language_names = []
+        else:
+            self.language_names = [self.language_extension_map.get(ext) for ext in file_extensions if
+                                   ext in self.language_extension_map]
+        self.languages = {}
+        self._install_parsers()
+    def _install_parsers(self):
+        logging.basicConfig(level=logging.INFO)  # Configure logging
+        # Ensure cache directory exists
+        if not os.path.exists(self.CACHE_DIR):
+            os.makedirs(self.CACHE_DIR)
+        # Configure logging to output to the terminal
+        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+        for language in self.language_names:
+            repo_path = os.path.join(self.CACHE_DIR, f"tree-sitter-{language}")
+            if not os.path.exists(repo_path):
+                clone_command = f"git clone https://github.com/tree-sitter/tree-sitter-{language} {repo_path}"
+                result = subprocess.run(
+                    clone_command,
+                    shell=True,
+                    stdout=subprocess.PIPE,  # Capture standard output
+                    stderr=subprocess.PIPE  # Capture standard error
+                )
+                # Check if cloning was successful
+                if result.returncode != 0:
+                    logging.error(
+                        f"Failed to clone repository for {language}. Command: '{clone_command}'. Error: {result.stderr.decode('utf-8')}")
+                    raise Exception(f"Failed to clone repository for {language}")
+            build_path = os.path.join(self.CACHE_DIR, f"build/{language}.so")
+            Language.build_library(build_path, [repo_path])
+            self.languages[language] = Language(build_path, language)
+    def parse_code(self, code: str, file_extension: str) -> Union[None, Node]:
+        language_name = self.language_extension_map.get(file_extension)
+        if language_name is None:
+            print(f"Unsupported file type: {file_extension}")
+            return None
+        language = self.languages.get(language_name)
+        if language is None:
+            print("Language parser not found")
+            return None
+        parser = Parser()
+        parser.set_language(language)
+        tree = parser.parse(bytes(code, "utf8"))
+        if tree is None:
+            print("Failed to parse the code")
+            return None
+        return tree.root_node
+    def extract_points_of_interest(self, node: Node, file_extension: str) -> List[Tuple[Node, str]]:
+        node_types_of_interest = self._get_node_types_of_interest(file_extension)
+        points_of_interest = []
+        if node.type in node_types_of_interest.keys():
+            points_of_interest.append((node, node_types_of_interest[node.type]))
+        for child in node.children:
+            points_of_interest.extend(self.extract_points_of_interest(child, file_extension))
+        return points_of_interest
+    def _get_node_types_of_interest(self, file_extension: str) -> Dict[str, str]:
+        node_types = {
+            'py': {
+                'import_statement': 'Import',
+                'export_statement': 'Export',
+                'class_definition': 'Class',
+                'function_definition': 'Function',
+            },
+            'css': {
+                'tag_name': 'Tag',
+                '@media': 'Media Query',
+            },
+            'js': {
+                'import_statement': 'Import',
+                'export_statement': 'Export',
+                'class_declaration': 'Class',
+                'function_declaration': 'Function',
+                'arrow_function': 'Arrow Function',
+                'statement_block': 'Block',
+            }
+        }
+        if file_extension in node_types.keys():
+            return node_types[file_extension]
+        elif file_extension == "jsx":
+            return node_types["js"]
+        else:
+            raise ValueError("Unsupported file type")
+    def _get_nodes_for_comments(self, file_extension: str) -> Dict[str, str]:
+        node_types = {
+            'py': {
+                'comment': 'Comment',
+                'decorator': 'Decorator',  # Broadened category
+            },
+            'css': {
+                'comment': 'Comment'
+            },
+            'js': {
+                'comment': 'Comment',
+                'decorator': 'Decorator',  # Broadened category
+            }
+        }
+        if file_extension in node_types.keys():
+            return node_types[file_extension]
+        elif file_extension == "jsx":
+            return node_types["js"]
+        else:
+            raise ValueError("Unsupported file type")
+    def extract_comments(self, node: Node, file_extension: str) -> List[Tuple[Node, str]]:
+        node_types_of_interest = self._get_nodes_for_comments(file_extension)
+        comments = []
+        if node.type in node_types_of_interest:
+            comments.append((node, node_types_of_interest[node.type]))
+        for child in node.children:
+            comments.extend(self.extract_comments(child, file_extension))
+        return comments
+    def get_lines_for_points_of_interest(self, code: str, file_extension: str) -> List[int]:
+        language_name = self.language_extension_map.get(file_extension)
+        if language_name is None:
+            raise ValueError("Unsupported file type")
+        language = self.languages.get(language_name)
+        if language is None:
+            raise ValueError("Language parser not found")
+        parser = Parser()
+        parser.set_language(language)
+        tree = parser.parse(bytes(code, "utf8"))
+        root_node = tree.root_node
+        points_of_interest = self.extract_points_of_interest(root_node, file_extension)
+        line_numbers_with_type_of_interest = {}
+        for node, type_of_interest in points_of_interest:
+            start_line = node.start_point[0]
+            if type_of_interest not in line_numbers_with_type_of_interest:
+                line_numbers_with_type_of_interest[type_of_interest] = []
+            if start_line not in line_numbers_with_type_of_interest[type_of_interest]:
+                line_numbers_with_type_of_interest[type_of_interest].append(start_line)
+        lines_of_interest = []
+        for _, line_numbers in line_numbers_with_type_of_interest.items():
+            lines_of_interest.extend(line_numbers)
+        return lines_of_interest
+    def get_lines_for_comments(self, code: str, file_extension: str) -> List[int]:
+        language_name = self.language_extension_map.get(file_extension)
+        if language_name is None:
+            raise ValueError("Unsupported file type")
+        language = self.languages.get(language_name)
+        if language is None:
+            raise ValueError("Language parser not found")
+        parser = Parser()
+        parser.set_language(language)
+        tree = parser.parse(bytes(code, "utf8"))
+        root_node = tree.root_node
+        comments = self.extract_comments(root_node, file_extension)
+        line_numbers_with_comments = {}
+        for node, type_of_interest in comments:
+            start_line = node.start_point[0]
+            if type_of_interest not in line_numbers_with_comments:
+                line_numbers_with_comments[type_of_interest] = []
+            if start_line not in line_numbers_with_comments[type_of_interest]:
+                line_numbers_with_comments[type_of_interest].append(start_line)
+        lines_of_interest = []
+        for _, line_numbers in line_numbers_with_comments.items():
+            lines_of_interest.extend(line_numbers)
+        return lines_of_interest
+    def print_all_line_types(self, code: str, file_extension: str):
+        language_name = self.language_extension_map.get(file_extension)
+        if language_name is None:
+            print(f"Unsupported file type: {file_extension}")
+            return
+        language = self.languages.get(language_name)
+        if language is None:
+            print("Language parser not found")
+            return
+        parser = Parser()
+        parser.set_language(language)
+        tree = parser.parse(bytes(code, "utf8"))
+        root_node = tree.root_node
+        line_to_node_type = self.map_line_to_node_type(root_node)
+        code_lines = code.split('\n')
+        for line_num, node_types in line_to_node_type.items():
+            line_content = code_lines[line_num - 1]  # Adjusting index for zero-based indexing
+            print(f"line {line_num}: {', '.join(node_types)} | Code: {line_content}")
+    def map_line_to_node_type(self, node, line_to_node_type=None, depth=0):
+        if line_to_node_type is None:
+            line_to_node_type = {}
+        start_line = node.start_point[0] + 1  # Tree-sitter lines are 0-indexed; converting to 1-indexed
+        # Only add the node type if it's the start line of the node
+        if start_line not in line_to_node_type:
+            line_to_node_type[start_line] = []
+        line_to_node_type[start_line].append(node.type)
+        for child in node.children:
+            self.map_line_to_node_type(child, line_to_node_type, depth + 1)
+        return line_to_node_type
+    def print_simple_line_numbers_with_code(self, code: str):
+        code_lines = code.split('\n')
+        for i, line in enumerate(code_lines):
+            print(f"Line {i + 1}: {line}")

mock_codefiles.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+    "simple.py": "import sys\n\n# This is a sample Python file\n\ndef main():\n    print('Hello, world!')\n\nif __name__ == '__main__':\n    main()",
+    "text_only.py": "# This file is empty and should test the chunker's ability to handle empty files\n",
+    "routes.py": "from flask import Flask, jsonify, request, redirect, url_for\napp = Flask(__name__)\n\n@app.route('/', methods=['GET'])\ndef home():\n    return '<h1>Welcome to the Home Page</h1>', 200\n\n@authenticate  # Hypothetical decorator for authentication\n@log_access  # Hypothetical decorator for logging access\n@app.route('/api/data', methods=['GET'])\ndef get_data():\n    # Simulate fetching data from a database or external service\n    data = {'key': 'This is some data'}\n    return jsonify(data), 200\n\n@app.route('/api/data/<int:data_id>', methods=['GET'])\ndef get_data_by_id(data_id):\n    # Simulate fetching specific data by ID\n    data = {'id': data_id, 'value': 'Specific data based on ID'}\n    return jsonify(data), 200\n\n@app.route('/api/data', methods=['POST'])\ndef post_data():\n    data = request.json\n    # Simulate saving data to a database\n    return jsonify({'message': 'Data saved successfully', 'data': data}), 201\n\n@app.route('/api/data/<int:data_id>', methods=['PUT'])\ndef update_data(data_id):\n    data = request.json\n    # Simulate updating data in a database\n    return jsonify({'message': 'Data updated successfully', 'id': data_id, 'data': data}), 200\n\n@app.route('/api/data/<int:data_id>', methods=['DELETE'])\ndef delete_data(data_id):\n    # Simulate deleting data by ID\n    return jsonify({'message': 'Data deleted successfully', 'id': data_id}), 200\n\n@app.route('/redirect', methods=['GET'])\ndef example_redirect():\n    return redirect(url_for('home'))\n\nif __name__ == '__main__':\n    app.run(debug=True)",
+    "models.py": "from sqlalchemy import Column, Integer, String, ForeignKey\nfrom sqlalchemy.ext.declarative import declarative_base\nfrom sqlalchemy.orm import relationship\n\nBase = declarative_base()\n\nclass User(Base):\n    __tablename__ = 'users'\n    id = Column(Integer, primary_key=True)\n    username = Column(String, unique=True, nullable=False)\n    email = Column(String, unique=True, nullable=False)\n\n    posts = relationship('Post', backref='author')\n\nclass Post(Base):\n    __tablename__ = 'posts'\n    id = Column(Integer, primary_key=True)\n    title = Column(String, nullable=False)\n    content = Column(String, nullable=False)\n    user_id = Column(Integer, ForeignKey('users.id'))",
+    "big_class.py": "class BigClass:\n    def __init__(self, name, age):\n        self.name = name\n        self.age = age\n\n    def get_name(self):\n        return self.name\n\n    def get_age(self):\n        return self.age\n\n    def set_name(self, name):\n        self.name = name\n\n    def set_age(self, age):\n        self.age = age\n\n    def __str__(self):\n        return f'Name: {self.name}, Age: {self.age}'",
+    "main.py": "from flask import Flask\nfrom routes import app as routes_app\n\n# Create the Flask application\napp = Flask(__name__)\n\n# Register the routes from the routes.py file\napp.register_blueprint(routes_app)\n\n# Configuration settings for the app can go here\napp.config['DEBUG'] = True\napp.config['SECRET_KEY'] = 'your_secret_key'\n\n# More complex app initialization steps can be added here\n# For example, database initialization, login manager setups, etc.\n\n# This function can be used to create a database schema\n def create_database(app):\n    if not path.exists('yourdatabase.db'):\n         db.create_all(app=app)\n         print('Created Database!')\n\nif __name__ == '__main__':\n    # Optionally, call database creation or other setup functions here\n    create_database(app)\n    app.run()",
+    "utilities.py": "import hashlib\nimport uuid\nimport re\nfrom datetime import datetime, timedelta\n\n# Function to hash a password\ndef hash_password(password):\n    salt = uuid.uuid4().hex\n    return hashlib.sha256(salt.encode() + password.encode()).hexdigest() + ':' + salt\n\n# Function to check a hashed password\ndef check_password(hashed_password, user_password):\n    password, salt = hashed_password.split(':')\n    return password == hashlib.sha256(salt.encode() + user_password.encode()).hexdigest()\n\n# Function to validate an email address\ndef validate_email(email):\n    pattern = r\"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$\"\n    return re.match(pattern, email) is not None\n\n# Function to generate a token expiration date\ndef generate_expiration_date(days=1):\n    return datetime.now() + timedelta(days=days)\n\n# Function to convert a string to datetime\ndef string_to_datetime(date_string, format='%Y-%m-%d %H:%M:%S'):\n    return datetime.strptime(date_string, format)\n\n# Function to convert datetime to string\ndef datetime_to_string(date, format='%Y-%m-%d %H:%M:%S'):\n    return date.strftime(format)",
+    "services.py": "import requests\nfrom flask_sqlalchemy import SQLAlchemy\n\n# Assuming an initialized Flask app with SQLAlchemy\ndb = SQLAlchemy()\n\n# Example of a model for database operations\nclass UserData(db.Model):\n    id = db.Column(db.Integer, primary_key=True)\n    name = db.Column(db.String(100), nullable=False)\n    email = db.Column(db.String(100), unique=True, nullable=False)\n\n    def __repr__(self):\n        return f'<User {self.name}>'\n\n# Function to fetch data from an external API\ndef fetch_external_data(api_url):\n    response = requests.get(api_url)\n    if response.status_code == 200:\n        return response.json()\n    else:\n        return {'error': 'Failed to fetch data'}\n\n# Function to save user data to the database\ndef save_user_data(name, email):\n    new_user = UserData(name=name, email=email)\n    db.session.add(new_user)\n    try:\n        db.session.commit()\n        return {'message': 'User saved successfully'}\n    except Exception as e:\n        db.session.rollback()\n        return {'error': str(e)}\n\n# Function to update user data in the database\ndef update_user_data(user_id, name=None, email=None):\n    user = UserData.query.get(user_id)\n    if not user:\n        return {'error': 'User not found'}\n    if name:\n        user.name = name\n    if email:\n        user.email = email\n    try:\n        db.session.commit()\n        return {'message': 'User updated successfully'}\n    except Exception as e:\n        db.session.rollback()\n        return {'error': str(e)}\n\n# Function to delete user data from the database\ndef delete_user_data(user_id):\n    user = UserData.query.get(user_id)\n    if not user:\n        return {'error': 'User not found'}\n    try:\n        db.session.delete(user)\n        db.session.commit()\n        return {'message': 'User deleted successfully'}\n    except Exception as e:\n        db.session.rollback()\n        return {'error': str(e)}",
+    "simple.js": "// This is a sample JavaScript file\n\nfunction main() {\n  console.log('Hello, world!');\n}\n\nmain();",
+    "text_only.js": "// This file is empty and should test the chunker's ability to handle empty files\n",
+    "routes.js": "const express = require('express');\nconst router = express.Router();\n\n// Example of a simple route\nrouter.get('/', (req, res) => {\n  res.send('Welcome to the Home Page');\n});\n\n// Example of a route with a parameter\nrouter.get('/api/data/:data_id', (req, res) => {\n  const dataId = req.params.data_id;\n  res.json({ id: dataId, value: 'Specific data based on ID' });\n});\n\n// Example of a route that handles POST requests\nrouter.post('/api/data', (req, res) => {\n  const data = req.body;\n  res.status(201).json({ message: 'Data saved successfully', data: data });\n});\n\n// Example of a route that handles PUT requests\nrouter.put('/api/data/:data_id', (req, res) => {\n  const dataId = req.params.data_id;\n  const data = req.body;\n  res.json({ message: 'Data updated successfully', id: dataId, data: data });\n});\n\n// Example of a route that handles DELETE requests\nrouter.delete('/api/data/:data_id', (req, res) => {\n  const dataId = req.params.data_id;\n  res.json({ message: 'Data deleted successfully', id: dataId });\n});\n\nmodule.exports = router;",
+    "models.js": "const mongoose = require('mongoose');\n\n// Example of a simple Mongoose model\nconst userSchema = new mongoose.Schema({\n  username: { type: String, required: true, unique: true },\n  email: { type: String, required: true, unique: true }\n});\n\nconst User = mongoose.model('User', userSchema);\n\nmodule.exports = User;",
+    "big_class.js": "// This is a sample JavaScript class with a large number of methods\n\nclass BigClass {\n  constructor(name, age) {\n    this.name = name;\n    this.age = age;\n  }\n\n  getName() {\n    return this.name;\n  }\n\n  getAge() {\n    return this.age;\n  }\n\n  setName(name) {\n    this.name = name;\n  }\n\n  setAge(age) {\n    this.age = age;\n  }\n\n  toString() {\n    return `Name: ${this.name}, Age: ${this.age}`;\n  }\n}\n\nmodule.exports = BigClass;",
+    "main.js": "const express = require('express');\nconst routes = require('./routes');\n\n// Create the Express application\nconst app = express();\n\n// Register the routes from the routes.js file\napp.use('/', routes);\n\n// Configuration settings for the app can go here\napp.set('port', process.env.PORT || 3000);\n\n// More complex app initialization steps can be added here\n// For example, database initialization, middleware setups, etc.\n\n// This function can be used to create a database schema\nfunction createDatabase() {\n  // Code to create database schema\n  console.log('Created Database!');\n}\n\n// Optionally, call database creation or other setup functions here\ncreateDatabase();\n\n// Start the server\napp.listen(app.get('port'), () => {\n  console.log(`Server running on port ${app.get('port')}`);\n});",
+    "utilities.js": "// Example of utility functions for common tasks\n\n// Function to hash a password\nfunction hashPassword(password) {\n  const salt = uuidv4();\n  return sha256(salt + password) + ':' + salt;\n}\n\n// Function to check a hashed password\nfunction checkPassword(hashedPassword, userPassword) {\n  const [password, salt] = hashedPassword.split(':');\n  return sha256(salt + userPassword) === password;\n}\n\n// Function to validate an email address\nfunction validateEmail(email) {\n  const pattern = /^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$/;\n  return pattern.test(email);\n}\n\n// Function to generate a token expiration date\nfunction generateExpirationDate(days = 1) {\n  const expirationDate = new Date();\n  expirationDate.setDate(expirationDate.getDate() + days);\n  return expirationDate;\n}\n\n// Function to convert a string to a Date object\nfunction stringToDate(dateString, format = 'YYYY-MM-DD HH:mm:ss') {\n  return new Date(dateString);\n}\n\n// Function to convert a Date object to a string\nfunction dateToString(date, format = 'YYYY-MM-DD HH:mm:ss') {\n  return date.toISOString();\n}",
+    "services.js": "// Example of service functions for handling data operations\n\n// Function to fetch data from an external API\nasync function fetchExternalData(apiUrl) {\n  try {\n    const response = await fetch(apiUrl);\n    if (response.ok) {\n      return await response.json();\n    } else {\n      return { error: 'Failed to fetch data' };\n    }\n  } catch (error) {\n    return { error: error.message };\n  }\n}\n\n// Function to save user data to the database\nasync function saveUserData(name, email) {\n  try {\n    const newUser = new UserData({ name, email });\n    await newUser.save();\n    return { message: 'User saved successfully' };\n  } catch (error) {\n    return { error: error.message };\n  }\n}\n\n// Function to update user data in the database\nasync function updateUserData(userId, name, email) {\n  try {\n    const user = await UserData.findById(userId);\n    if (!user) {\n      return { error: 'User not found' };\n    }\n    if (name) {\n      user.name = name;\n    }\n    if (email) {\n      user.email = email;\n    }\n    await user.save();\n    return { message: 'User updated successfully' };\n  } catch (error) {\n    return { error: error.message };\n  }\n}\n\n// Function to delete user data from the database\nasync function deleteUserData(userId) {\n  try {\n    const user = await UserData.findById(userId);\n    if (!user) {\n      return { error: 'User not found' };\n    }\n    await user.remove();\n    return { message: 'User deleted successfully' };\n  } catch (error) {\n    return { error: error.message };\n  }\n}",
+    "react_component.js": "import React from 'react';\n\nimport './SearchResults.css';\n\nimport TrackList from '../TrackList/TrackList.js';\n\n constructor(props) {\n        super(props);\n        this.addTopFive = this.addTopFive.bind(this);\n        this.addTopTen = this.addTopTen.bind(this);\n        this.addAll = this.addAll.bind(this);\n    }\n\n    //add the top five tracks to the playlist\n    addTopFive() {\n        this.props.onAdd(this.props.searchResults.slice(0, 5));\n    }\n\n    //add top 10 tracks to the playlist\n    addTopTen() {\n        this.props.onAdd(this.props.searchResults.slice(0, 10));\n    }\n\n    addAll() {\n        this.props.onAdd(this.props.searchResults);\n    }\n    render() {\n    return (\n      <div className=\"SearchResults\">\n        <h2>Results</h2>\n        <TrackList tracks={this.props.searchResults} onAdd={this.props.onAdd} onToggle={this.props.onToggle}  currentTrack={this.props.currentTrack}/>\n      </div>\n    );\n  }\n}\n\nexport default SearchResults;'",
+    "simple_styles.css": "/* Example of CSS styles for a web page */\n\nbody {\n  font-family: Arial, sans-serif;\n  background-color: #f4f4f4;\n  margin: 0;\n  padding: 0;\n}\n\nh1 {\n  color: #333;\n  text-align: center;\n}\n\nbutton {\n  padding: 10px 20px;\n  font-size: 16px;\n  background-color: #007bff;\n  color: #fff;\n  border: none;\n  cursor: pointer;\n}\n\nbutton:hover {\n  background-color: #0056b3;\n}",
+    "media_queries.css": "/* Example of CSS styles with media queries for responsive design */\n\nbody {\n  font-family: Arial, sans-serif;\n  background-color: #f4f4f4;\n  margin: 0;\n  padding: 0;\n}\n\nh1 {\n  color: #333;\n  text-align: center;\n}\n\nbutton {\n  padding: 10px 20px;\n  font-size: 16px;\n  background-color: #007bff;\n  color: #fff;\n  border: none;\n  cursor: pointer;\n}\n\nbutton:hover {\n  background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n  button {\n    padding: 8px 16px;\n    font-size: 14px;\n  }\n}",
+    "single_syntax_error_example.py": "# This is a sample Python file\n\nprint('Hello, world!'\n\n",
+    "multiple_syntax_errors.py": "def calculate_sum(lst):\n    total = 0\n    for num in lst\n        total += num\n    return total\n\nprint(calculate_sum([1, 2, 3, 4])\n\ndef string_manipulator(s):\n    new_string = ''\n    for char in s:\n        if char == 'a':\n        new_string += 'z'\n    else:\n        new_string += char\n    return new_string\n\nprint(string_manipulate('banana'))\n\ndef find_max(numbers):\n    max_num = numbers[0]\n    for num in numbers\n        if num > max_num\n            max_num = num\n    return max_num\n\nprint(find_max([1, 2, 3, 4, 5])",
+    "single_syntax_error_example.js": "//This is a sample JavaScript file\n\nfunction main() {\n  console.log('Hello, world!');\n  if (true) {\n    console.log('hi');\n  \n}\n\nmain();",
+    "multiple_syntax_errors.js": "function calculateSum(arr) {\n  let total = 0;\n  for (let i = 0; i < arr.length; i++ {\n    total += arr[i];\n  }\n  return total;\n}\n\nconsole.log(calculateSum([1, 2, 3, 4);\n\nfunction stringManipulator(str) {\n  let newString = '';\n  for (let i = 0; i < str.length; i++) {\n    if (str.charAt(i) === 'a')\n      newString += 'z';\n    } else {\n      newString += str.charAt(i);\n    }\n  }\n  return newString;\n}\n\nconsole.log(stringManipulator('banana'));\n\nfunction findMax(numbers) {\n  let maxNum = numbers[0];\n  for (let i = 1; i < numbers.length; i++) {\n    if (numbers[i] > maxNum) {\n      maxNum = numbers[i];\n    }\n  }\n  return maxNum;\n}\n\nconsole.log(findMax([1, 2, 3, 4, 5]);",
+    "single_syntax_error_example.css": "\n\nbody {\n  font-family: Arial, sans-serif;\n  background-color: #f4f4f4;\n  margin: 0;\n  padding: 0;\n}\n\nh1 {\n  color: #333;\n  text-align: center;\n}\n\nbutton {\n  padding: 10px 20px;\n  font-size: 16px;\n  background-color: #007bff;\n  color: #fff;\n  border: none;\n  cursor: pointer;\n  :hover {\n  background-color: #0056b3;\n}\n",
+    "multiple_syntax_errors.css": "body {\n  font-family: Arial, sans-serif;\n  background-color: #f4f4f4;\n  margin: 0;\n  padding: 0;\n}\n\nh1 {\n  color: #333;\n  text-align: center;\n}\n\nbutton {\n  padding: 10px 20px;\n  font-size: 16px;\n  background-color: #007bff;\n  color: #fff;\n  border: none;\n  cursor: pointer;\n  :hover {\n  background-color: #0056b3;\n}\n\n/* Media query for smaller screens */\n@media (max-width: 768px) {\n  button {\n    padding: 8px 16px;\n    font-size: 14px;\n  }\n}"
+  }

requirements.txt ADDED Viewed

File without changes

test_code_chunker.py ADDED Viewed

	@@ -0,0 +1,214 @@

+import unittest
+from unittest.mock import patch
+from Chunker import Chunker, CodeChunker
+from utils import load_json
+import tiktoken
+import json
+import os
+# Mocking the count_tokens function as it's external and not the focus of these tests
+def mock_count_tokens(string: str, encoding_name='gpt-4') -> int:
+    """Returns the number of tokens in a text string."""
+    encoding = tiktoken.encoding_for_model(encoding_name)
+    num_tokens = len(encoding.encode(string))
+    return num_tokens
+# Python Test Class
+class TestCodeChunkerPython(unittest.TestCase):
+    def setUp(self):
+        self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
+        self.mock_count_tokens = self.patcher.start()
+        self.code_chunker = CodeChunker(file_extension='py')
+        self.mock_codebase = load_json('mock_codefiles.json')
+    def tearDown(self):
+        self.patcher.stop()
+    def test_chunk_simple_code(self):
+        py_code = self.mock_codebase['simple.py']
+        first_chunk_token_limit = mock_count_tokens("import sys")
+        print(f"first_chunk_token_limit = {first_chunk_token_limit}")
+        chunks = self.code_chunker.chunk(py_code, token_limit=25)
+        token_count = self.mock_count_tokens(py_code)
+        print(f"token_count = {token_count}")
+        print(f"original code:\n {py_code}")
+        Chunker.print_chunks(chunks)
+        full_code = Chunker.consolidate_chunks_into_file(chunks)
+        print(f"code after consolidation:\n {full_code}")
+        num_lines = Chunker.count_lines(full_code)
+        self.assertEqual(num_lines, len(py_code.split("\n"))) # The number of lines should be the same
+        self.assertIn(full_code, py_code) # The full code should be in the original code
+        self.assertEqual(len(chunks), 2) # There should be 2 chunks
+        self.assertIn("import sys", chunks[1]) # The first chunk should contain the import statement
+        self.assertIn("print('Hello, world!')", chunks[2]) # The second chunk should contain the print statement
+    def test_chunk_code_text_only(self):
+        py_code = self.mock_codebase['text_only.py']
+        chunks = self.code_chunker.chunk(py_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(py_code.split("\n"))) # The number of lines should be the same
+        self.assertIn(py_code, final_code) # The full code should be in the original code
+        self.assertEqual(len(chunks), 1)
+        self.assertIn("This file is empty and should test the chunker's ability to handle empty files", chunks[1])
+    def test_chunk_code_with_routes(self):
+        py_code = self.mock_codebase['routes.py']
+        chunks = self.code_chunker.chunk(py_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(py_code.split("\n"))) # The number of lines should be the same
+        self.assertIn(py_code, final_code) # The full code should be in the original code
+    def test_chunk_code_with_models(self):
+        py_code = self.mock_codebase['models.py']
+        chunks = self.code_chunker.chunk(py_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(py_code.split("\n")))
+        self.assertIn(py_code, final_code)
+    def test_chunk_code_with_main(self):
+        py_code = self.mock_codebase['main.py']
+        chunks = self.code_chunker.chunk(py_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(py_code.split("\n")))
+        self.assertIn(py_code, final_code)
+    def test_chunk_code_with_utilities(self):
+        py_code = self.mock_codebase['utilities.py']
+        chunks = self.code_chunker.chunk(py_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(py_code.split("\n")))
+        self.assertIn(py_code, final_code)
+    def test_chunk_code_with_big_class(self):
+        py_code = self.mock_codebase['big_class.py']
+        chunks = self.code_chunker.chunk(py_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(py_code.split("\n")))
+        self.assertIn(py_code, final_code)
+# JavaScript Test Class
+class TestCodeChunkerJavaScript(unittest.TestCase):
+    def setUp(self):
+        self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
+        self.mock_count_tokens = self.patcher.start()
+        self.code_chunker = CodeChunker(file_extension='js')
+        self.mock_codebase = load_json('mock_codefiles.json')
+    def tearDown(self):
+        self.patcher.stop()
+    def test_chunk_javascript_simple_code(self):
+        js_code = self.mock_codebase['simple.js']
+        chunks = self.code_chunker.chunk(js_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(js_code.split("\n")))
+        self.assertIn(js_code, final_code)
+    def test_chunk_javascript_with_routes(self):
+        js_code = self.mock_codebase['routes.js']
+        chunks = self.code_chunker.chunk(js_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(js_code.split("\n")))
+        self.assertIn(js_code, final_code)
+    def test_chunk_javascript_with_models(self):
+        js_code = self.mock_codebase['models.js']
+        chunks = self.code_chunker.chunk(js_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(js_code.split("\n")))
+        self.assertIn(js_code, final_code)
+    def test_chunk_javascript_with_main(self):
+        js_code = self.mock_codebase['main.js']
+        chunks = self.code_chunker.chunk(js_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(js_code.split("\n")))
+        self.assertIn(js_code, final_code)
+    def test_chunk_javascript_with_utilities(self):
+        js_code = self.mock_codebase['utilities.js']
+        chunks = self.code_chunker.chunk(js_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(js_code.split("\n")))
+        self.assertIn(js_code, final_code)
+    def test_chunk_javascript_with_big_class(self):
+        js_code = self.mock_codebase['big_class.js']
+        chunks = self.code_chunker.chunk(js_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(js_code.split("\n")))
+        self.assertIn(js_code, final_code)
+    def test_chunk_javascript_with_react_component(self):
+        js_code = self.mock_codebase['react_component.js']
+        chunks = self.code_chunker.chunk(js_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(js_code.split("\n")))
+        self.assertIn(js_code, final_code)
+# CSS Test Class
+class TestCodeChunkerCSS(unittest.TestCase):
+    def setUp(self):
+        self.patcher = patch('app.util.TextChunker.Chunker.count_tokens', side_effect=mock_count_tokens)
+        self.mock_count_tokens = self.patcher.start()
+        self.code_chunker = CodeChunker(file_extension='css')
+         #Load the JSON data
+        self.mock_codebase = load_json('mock_codefiles.json')
+    def tearDown(self):
+        self.patcher.stop()
+    def test_chunk_css_with_media_query(self):
+        css_code = self.mock_codebase['media_queries.css']
+        chunks = self.code_chunker.chunk(css_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(css_code.split("\n")))
+        self.assertIn(css_code, final_code)
+    def test_chunk_css_with_simple_css(self):
+        css_code = self.mock_codebase['simple_styles.css']
+        chunks = self.code_chunker.chunk(css_code, token_limit=20)
+        Chunker.print_chunks(chunks)
+        final_code = Chunker.consolidate_chunks_into_file(chunks)
+        num_lines = Chunker.count_lines(Chunker.consolidate_chunks_into_file(chunks))
+        self.assertEqual(num_lines, len(css_code.split("\n")))
+        self.assertIn(css_code, final_code)
+# Run every TestCase in this module when the file is executed directly.
+if __name__ == '__main__':
+    unittest.main()

utils.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import tiktoken
+import json
+def count_tokens(string: str, encoding_name: str) -> int:
+    """Returns the number of tokens in a text string."""
+    encoding = tiktoken.encoding_for_model(encoding_name)
+    num_tokens = len(encoding.encode(string))
+    return num_tokens
+def load_json(json_file):
+    with open(json_file) as f:
+        return json.load(f)