oceansweep committed on
Commit
7c8d003
1 Parent(s): 47c5e69

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +172 -0
  2. templates/index.html +199 -0
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, render_template
2
+ import re
3
+ import logging
4
+ import nltk
5
+ from typing import List
6
+ from langdetect import detect
7
+
8
# The Flask application object; the route decorators below register on it.
app = Flask(__name__)

# Configure logging at DEBUG level so the debug traces emitted by the
# chunking helpers and the request handlers are actually shown.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Download the NLTK data needed by nltk.sent_tokenize.
# 'punkt' is the legacy model; newer NLTK releases load 'punkt_tab' instead,
# so both are fetched to keep sentence splitting working across versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
16
+
17
+ # Default text (you can add more default texts as needed)
18
+ default_prose = """One of the most important things I didn't understand about the world when I was a child is the degree to which the returns for performance are superlinear.
19
+
20
+ Teachers and coaches implicitly told us the returns were linear. "You get out," I heard a thousand times, "what you put in." They meant well, but this is rarely true. If your product is only half as good as your competitor's, you don't get half as many customers. You get no customers, and you go out of business.
21
+
22
+ It's obviously true that the returns for performance are superlinear in business. Some think this is a flaw of capitalism, and that if we changed the rules it would stop being true. But superlinear returns for performance are a feature of the world, not an artifact of rules we've invented. We see the same pattern in fame, power, military victories, knowledge, and even benefit to humanity. In all of these, the rich get richer. [1]
23
+
24
+ You can't understand the world without understanding the concept of superlinear returns. And if you're ambitious you definitely should, because this will be the wave you surf on.
25
+
26
+ It may seem as if there are a lot of different situations with superlinear returns, but as far as I can tell they reduce to two fundamental causes: exponential growth and thresholds.
27
+
28
+ The most obvious case of superlinear returns is when you're working on something that grows exponentially. For example, growing bacterial cultures. When they grow at all, they grow exponentially. But they're tricky to grow. Which means the difference in outcome between someone who's adept at it and someone who's not is very great.
29
+
30
+ Startups can also grow exponentially, and we see the same pattern there. Some manage to achieve high growth rates. Most don't. And as a result you get qualitatively different outcomes: the companies with high growth rates tend to become immensely valuable, while the ones with lower growth rates may not even survive.
31
+
32
+ Y Combinator encourages founders to focus on growth rate rather than absolute numbers. It prevents them from being discouraged early on, when the absolute numbers are still low. It also helps them decide what to focus on: you can use growth rate as a compass to tell you how to evolve the company. But the main advantage is that by focusing on growth rate you tend to get something that grows exponentially.
33
+
34
+ YC doesn't explicitly tell founders that with growth rate "you get out what you put in," but it's not far from the truth. And if growth rate were proportional to performance, then the reward for performance p over time t would be proportional to pt.
35
+
36
+ Even after decades of thinking about this, I find that sentence startling."""
37
+
38
def detect_language(text: str) -> str:
    """Return the language code that ``langdetect`` detects for *text*.

    ``langdetect.detect`` raises ``LangDetectException`` when the text has no
    usable features (for example an empty or whitespace-only string). In that
    case ``'en'`` is returned, which sends callers down their generic
    (non-Chinese, non-Japanese) branch instead of failing the request.
    """
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        logging.debug("Language detection failed; falling back to 'en'")
        return 'en'
40
+
41
def post_process_chunks(chunks: List[str]) -> List[str]:
    """Post-processing hook applied to every chunk list before it is returned.

    No post-processing is implemented yet: the list that was passed in is
    handed back as-is (same object, unmodified).
    """
    processed = chunks  # placeholder: add post-processing steps here if needed
    return processed
44
+
45
def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0, language: str = None) -> List[str]:
    """Split *text* into chunks of at most *max_words* words.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be at least 1.
        overlap: Number of words shared between consecutive chunks. Each
            chunk starts ``max_words - overlap`` words after the previous
            one; that stride is floored at 1 so an overlap that is equal to
            or larger than ``max_words`` still advances through the text.
        language: Language code; detected from *text* when ``None``.

    Returns:
        The chunks (words joined by single spaces), passed through
        ``post_process_chunks``.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    logging.debug("chunk_text_by_words...")
    if max_words < 1:
        raise ValueError("max_words must be at least 1")
    if language is None:
        language = detect_language(text)

    if language.startswith('zh'):  # Chinese
        import jieba
        words = list(jieba.cut(text))
    elif language == 'ja':  # Japanese
        import fugashi
        tagger = fugashi.Tagger()
        words = [word.surface for word in tagger(text)]
    else:  # Default to simple whitespace splitting for other languages
        words = text.split()

    # A zero stride makes range() raise and a negative one yields no chunks
    # at all, so never let the stride drop below one word.
    step = max(1, max_words - overlap)
    chunks = [' '.join(words[start:start + max_words]) for start in range(0, len(words), step)]
    return post_process_chunks(chunks)
65
+
66
def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = None) -> List[str]:
    """Split *text* into chunks of at most *max_sentences* sentences.

    Args:
        text: The text to split.
        max_sentences: Maximum number of sentences per chunk; must be >= 1.
        overlap: Number of sentences shared between consecutive chunks. The
            stride ``max_sentences - overlap`` is floored at 1 so an overlap
            equal to or larger than ``max_sentences`` still advances.
        language: Language code; detected from *text* when ``None``.

    Returns:
        The chunks (sentences joined by single spaces), passed through
        ``post_process_chunks``.

    Raises:
        ValueError: If *max_sentences* is smaller than 1.
    """
    logging.debug("chunk_text_by_sentences...")
    if max_sentences < 1:
        raise ValueError("max_sentences must be at least 1")
    if language is None:
        language = detect_language(text)

    if language.startswith('zh') or language == 'ja':  # Chinese / Japanese
        # Word segmenters (jieba, fugashi) produce words or punctuation
        # tokens, not sentences. Split on sentence-ending punctuation instead
        # and keep the terminator plus any closing quote/bracket with the
        # sentence it ends.
        pieces = re.findall(r'[^。！？!?]+[。！？!?]*[」』”’）)]*|[。！？!?]+', text)
        sentences = [piece.strip() for piece in pieces if piece.strip()]
    else:  # Default to NLTK for other languages
        sentences = nltk.sent_tokenize(text)

    # A zero stride makes range() raise and a negative one yields no chunks
    # at all, so never let the stride drop below one sentence.
    step = max(1, max_sentences - overlap)
    chunks = [' '.join(sentences[start:start + max_sentences]) for start in range(0, len(sentences), step)]
    return post_process_chunks(chunks)
86
+
87
def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
    """Split *text* into chunks of at most *max_paragraphs* paragraphs.

    Paragraphs are separated by a blank line (a newline, optional
    whitespace, and another newline).

    Args:
        text: The text to split.
        max_paragraphs: Maximum number of paragraphs per chunk; must be >= 1.
        overlap: Number of paragraphs shared between consecutive chunks. The
            stride ``max_paragraphs - overlap`` is floored at 1 so an overlap
            equal to or larger than ``max_paragraphs`` still advances.

    Returns:
        The chunks (paragraphs joined by a blank line), passed through
        ``post_process_chunks``.

    Raises:
        ValueError: If *max_paragraphs* is smaller than 1.
    """
    logging.debug("chunk_text_by_paragraphs...")
    if max_paragraphs < 1:
        raise ValueError("max_paragraphs must be at least 1")

    paragraphs = re.split(r'\n\s*\n', text)

    # A zero stride makes range() raise and a negative one yields no chunks
    # at all, so never let the stride drop below one paragraph.
    step = max(1, max_paragraphs - overlap)
    chunks = ['\n\n'.join(paragraphs[start:start + max_paragraphs]) for start in range(0, len(paragraphs), step)]
    return post_process_chunks(chunks)
95
+
96
def _estimate_token_count(word: str) -> int:
    """Rough token estimate for one word: one token per four characters, plus one."""
    return len(word) // 4 + 1


def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
    """Split *text* into chunks whose estimated token count stays within *max_tokens*.

    The text is split on whitespace and each word's token count is estimated
    from its length (no real tokenizer is used). Words are accumulated until
    adding the next one would exceed *max_tokens*; the chunk is then emitted
    and the next chunk starts with the last *overlap* words of the previous
    one.

    Args:
        text: The text to split.
        max_tokens: Estimated token budget per chunk.
        overlap: Number of trailing words carried over into the next chunk.
            Carried words are dropped, oldest first, when they would not fit
            in the budget together with the next word.

    Returns:
        The chunks (words joined by single spaces), passed through
        ``post_process_chunks``.
    """
    logging.debug("chunk_text_by_tokens...")
    words = text.split()
    chunks = []
    current_chunk = []
    current_token_count = 0

    for word in words:
        word_token_count = _estimate_token_count(word)
        if current_token_count + word_token_count > max_tokens and current_chunk:
            chunks.append(' '.join(current_chunk))
            carried = current_chunk[-overlap:] if overlap > 0 else []
            carried_tokens = sum(_estimate_token_count(w) for w in carried)
            # Without trimming, an overlap at least as long as the chunk
            # carries the whole chunk forward, so chunks would keep growing
            # past max_tokens and one chunk would be emitted per word.
            dropped = 0
            while dropped < len(carried) and carried_tokens + word_token_count > max_tokens:
                carried_tokens -= _estimate_token_count(carried[dropped])
                dropped += 1
            current_chunk = carried[dropped:]
            current_token_count = carried_tokens

        current_chunk.append(word)
        current_token_count += word_token_count

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return post_process_chunks(chunks)
117
+
118
@app.route('/')
def index():
    """Render and return the ``index.html`` template for the root URL."""
    page = render_template('index.html')
    return page
121
+
122
@app.route('/chunk', methods=['POST'])
def chunk_text():
    """Chunk the posted text and return the chunks with summary statistics.

    Expects a JSON object with the optional keys ``text`` (defaults to
    ``default_prose``), ``chunkSize`` (default 300), ``overlap`` (default 0)
    and ``splitter`` (one of ``words``, ``sentences``, ``paragraphs``,
    ``tokens``; default ``words``).

    Returns:
        A JSON object with ``chunks`` (each with ``text``, ``startIndex``,
        ``endIndex`` and ``overlapWithNext``), ``totalCharacters``,
        ``numberOfChunks`` and ``averageChunkSize``. Invalid input yields a
        JSON ``{'error': ...}`` body with status 400.
    """
    # silent=True returns None instead of raising on a missing/invalid body,
    # so every malformed request gets the same JSON error shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400
    logger.debug("Received data: %s", data)

    text = data.get('text', default_prose)
    chunk_size = data.get('chunkSize', 300)
    overlap = data.get('overlap', 0)
    splitter_type = data.get('splitter', 'words')

    if not isinstance(text, str):
        return jsonify({'error': 'text must be a string'}), 400
    # The browser sends null when a number field is empty (parseInt -> NaN),
    # so the numeric fields cannot be trusted to be integers.
    for field_name, value in (('chunkSize', chunk_size), ('overlap', overlap)):
        if isinstance(value, bool) or not isinstance(value, int):
            return jsonify({'error': f'{field_name} must be an integer'}), 400
    if chunk_size < 1:
        return jsonify({'error': 'chunkSize must be at least 1'}), 400
    if not 0 <= overlap < chunk_size:
        return jsonify({'error': 'overlap must be between 0 and chunkSize - 1'}), 400

    splitters = {
        'words': chunk_text_by_words,
        'sentences': chunk_text_by_sentences,
        'paragraphs': chunk_text_by_paragraphs,
        'tokens': chunk_text_by_tokens,
    }
    if not isinstance(splitter_type, str) or splitter_type not in splitters:
        return jsonify({'error': 'Invalid splitter type'}), 400

    logger.debug("Chunking with: splitter=%s, chunk_size=%s, overlap=%s", splitter_type, chunk_size, overlap)

    # Blank text has nothing to chunk (and language detection cannot handle
    # it), so skip the splitter entirely.
    chunks = splitters[splitter_type](text, chunk_size, overlap) if text.strip() else []

    logger.debug("Number of chunks created: %d", len(chunks))

    # Process chunks to include start and end indices.
    # NOTE(review): indices are accumulated from chunk lengths in characters,
    # while `overlap` is counted in words/sentences/paragraphs, so they are
    # only approximate positions in the original text - confirm what the
    # front end needs before relying on them.
    processed_chunks = []
    current_index = 0
    for chunk in chunks:
        end_index = current_index + len(chunk)
        processed_chunks.append({
            'text': chunk,
            'startIndex': current_index,
            'endIndex': end_index,
            'overlapWithNext': overlap if end_index < len(text) else 0
        })
        current_index = end_index - overlap

    logger.debug("Processed chunks: %s", processed_chunks)

    response = {
        'chunks': processed_chunks,
        'totalCharacters': len(text),
        'numberOfChunks': len(chunks),
        'averageChunkSize': sum(len(chunk) for chunk in chunks) / len(chunks) if chunks else 0
    }
    logger.debug("Sending response: %s", response)
    return jsonify(response)
170
+
171
if __name__ == '__main__':
    import os

    # debug=True enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server run arbitrary code, so it must not be exposed
    # to untrusted networks. Debug mode stays on by default; set
    # FLASK_DEBUG to 0/false/no in the environment to turn it off.
    debug_enabled = os.environ.get('FLASK_DEBUG', '1').strip().lower() not in ('0', 'false', 'no')
    app.run(debug=debug_enabled)
templates/index.html ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>ChunkViz Python</title>
7
+ <style>
8
+ .App {
9
+ text-align: center;
10
+ margin: 20px;
11
+ }
12
+ .chunked-text {
13
+ white-space: pre-wrap;
14
+ font-family: monospace;
15
+ width: 80%;
16
+ margin: 0 auto;
17
+ text-align: left;
18
+ padding-top: 30px;
19
+ }
20
+ .overlap {
21
+ background-color: #90a955;
22
+ }
23
+ textarea {
24
+ width: 80%;
25
+ margin-bottom: 10px;
26
+ }
27
+ .control-group {
28
+ margin-bottom: 10px;
29
+ }
30
+ #stats {
31
+ margin-top: 20px;
32
+ font-weight: bold;
33
+ }
34
+ .chunk {
35
+ display: inline;
36
+ }
37
+ </style>
38
+ </head>
39
+ <body>
40
+ <div class="App">
41
+ <h1>ChunkViz Python v0.2</h1>
42
+ <p>Language Models do better when they're focused.</p>
43
+ <p>One strategy is to pass a relevant subset (chunk) of your full data. There are many ways to chunk text.</p>
44
+ <p>This is a tool to understand different chunking/splitting strategies.</p>
45
+
46
+ <textarea id="textInput" rows="10"></textarea>
47
+
48
+ <div class="control-group">
49
+ <label for="splitterSelect">Splitter:</label>
50
+ <select id="splitterSelect">
51
+ <option value="words">Words</option>
52
+ <option value="sentences">Sentences</option>
53
+ <option value="paragraphs">Paragraphs</option>
54
+ <option value="tokens">Tokens</option>
55
+ </select>
56
+ </div>
57
+
58
+ <div class="control-group">
59
+ <label for="chunkSize">Chunk Size:</label>
60
+ <input type="number" id="chunkSize" min="1" max="2000" value="300">
61
+ <input type="range" id="chunkSizeRange" min="1" max="2000" value="300">
62
+ </div>
63
+
64
+ <div class="control-group">
65
+ <label for="overlap">Chunk Overlap:</label>
66
+ <input type="number" id="overlap" min="0" max="1000" value="0">
67
+ <input type="range" id="overlapRange" min="0" max="1000" value="0">
68
+ </div>
69
+
70
+ <div id="stats"></div>
71
+
72
+ <div id="chunkedText" class="chunked-text"></div>
73
+ </div>
74
+
75
+ <script>
76
+ document.addEventListener('DOMContentLoaded', function() {
77
// Look up the input controls and output containers once, by element id.
const byId = (id) => document.getElementById(id);

const textInput = byId('textInput');
const splitterSelect = byId('splitterSelect');
const chunkSize = byId('chunkSize');
const chunkSizeRange = byId('chunkSizeRange');
const overlap = byId('overlap');
const overlapRange = byId('overlapRange');
const stats = byId('stats');
const chunkedText = byId('chunkedText');
85
+
86
+ // Set default text
87
+ textInput.value = `One of the most important things I didn't understand about the world when I was a child is the degree to which the returns for performance are superlinear.
88
+
89
+ Teachers and coaches implicitly told us the returns were linear. "You get out," I heard a thousand times, "what you put in." They meant well, but this is rarely true. If your product is only half as good as your competitor's, you don't get half as many customers. You get no customers, and you go out of business.
90
+
91
+ It's obviously true that the returns for performance are superlinear in business. Some think this is a flaw of capitalism, and that if we changed the rules it would stop being true. But superlinear returns for performance are a feature of the world, not an artifact of rules we've invented. We see the same pattern in fame, power, military victories, knowledge, and even benefit to humanity. In all of these, the rich get richer. [1]
92
+
93
+ You can't understand the world without understanding the concept of superlinear returns. And if you're ambitious you definitely should, because this will be the wave you surf on.
94
+
95
+ It may seem as if there are a lot of different situations with superlinear returns, but as far as I can tell they reduce to two fundamental causes: exponential growth and thresholds.
96
+
97
+ The most obvious case of superlinear returns is when you're working on something that grows exponentially. For example, growing bacterial cultures. When they grow at all, they grow exponentially. But they're tricky to grow. Which means the difference in outcome between someone who's adept at it and someone who's not is very great.
98
+
99
+ Startups can also grow exponentially, and we see the same pattern there. Some manage to achieve high growth rates. Most don't. And as a result you get qualitatively different outcomes: the companies with high growth rates tend to become immensely valuable, while the ones with lower growth rates may not even survive.
100
+
101
+ Y Combinator encourages founders to focus on growth rate rather than absolute numbers. It prevents them from being discouraged early on, when the absolute numbers are still low. It also helps them decide what to focus on: you can use growth rate as a compass to tell you how to evolve the company. But the main advantage is that by focusing on growth rate you tend to get something that grows exponentially.
102
+
103
+ YC doesn't explicitly tell founders that with growth rate "you get out what you put in," but it's not far from the truth. And if growth rate were proportional to performance, then the reward for performance p over time t would be proportional to pt.
104
+
105
+ Even after decades of thinking about this, I find that sentence startling.`;
106
+
107
/**
 * Send the current text and settings to the /chunk endpoint and refresh the
 * statistics and the highlighted chunk display with the response.
 */
function updateChunks() {
    // Always parse as base 10; an empty number field parses to NaN.
    const sizeValue = parseInt(chunkSize.value, 10);
    const overlapValue = parseInt(overlap.value, 10);
    if (Number.isNaN(sizeValue) || Number.isNaN(overlapValue)) {
        // NaN would be serialized as null and rejected by the server, so
        // wait until both fields hold a number again.
        console.warn("Skipping update: chunk size or overlap is not a number");
        return;
    }

    console.log("Updating chunks...");
    fetch('/chunk', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify({
            text: textInput.value,
            chunkSize: sizeValue,
            overlap: overlapValue,
            splitter: splitterSelect.value
        }),
    })
    .then(response => {
        // Error responses do not carry the fields read below, so surface
        // them through the catch handler instead.
        if (!response.ok) {
            throw new Error(`Chunk request failed with status ${response.status}`);
        }
        return response.json();
    })
    .then(data => {
        console.log("Received data:", data);
        // Update stats
        stats.innerHTML = `
            <div>Total Characters: ${data.totalCharacters}</div>
            <div>Number of chunks: ${data.numberOfChunks}</div>
            <div>Average chunk size: ${data.averageChunkSize.toFixed(1)}</div>
        `;

        // Update chunked text display
        chunkedText.innerHTML = highlightChunks(data.chunks, textInput.value);
    })
    .catch(error => {
        console.error("Error:", error);
    });
}
138
+
139
/**
 * Build the HTML that shows each chunk on a coloured background.
 *
 * @param {Array<{text: string, startIndex: number, endIndex: number, overlapWithNext: number}>} chunks
 *        Chunks as returned by the /chunk endpoint.
 * @param {string} originalText The text that was chunked.
 * @returns {string} HTML markup; all text taken from the input is escaped.
 */
function highlightChunks(chunks, originalText) {
    // The result is assigned to innerHTML, so user-supplied text must be
    // escaped or any "<" / "&" in it would be parsed as markup.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    console.log("Highlighting chunks:", chunks);
    const colors = ['#70d6ff', '#e9ff70', '#ff9770', '#ffd670', '#ff70a6'];
    let highlightedText = '';
    let lastEnd = 0;

    chunks.forEach((chunk, index) => {
        const color = colors[index % colors.length];

        // Add any text between chunks
        if (chunk.startIndex > lastEnd) {
            highlightedText += escapeHtml(originalText.slice(lastEnd, chunk.startIndex));
        }

        // Add the chunk
        highlightedText += `<span class="chunk" style="background-color: ${color}">${escapeHtml(chunk.text)}</span>`;

        // Add overlap: the last overlapWithNext characters of the chunk
        if (chunk.overlapWithNext > 0) {
            const overlapText = chunk.text.slice(-chunk.overlapWithNext);
            highlightedText += `<span class="overlap">${escapeHtml(overlapText)}</span>`;
        }

        lastEnd = chunk.endIndex;
    });

    // Add any remaining text
    if (lastEnd < originalText.length) {
        highlightedText += escapeHtml(originalText.slice(lastEnd));
    }

    console.log("Highlighted text:", highlightedText);
    return highlightedText;
}
173
+
174
// Re-chunk whenever the text or the splitter changes.
textInput.addEventListener('input', updateChunks);
splitterSelect.addEventListener('change', updateChunks);

// Each number field is paired with a range slider: when one of them changes,
// copy its value to its partner and re-chunk.
const linkedInputs = [
    [chunkSize, chunkSizeRange],
    [chunkSizeRange, chunkSize],
    [overlap, overlapRange],
    [overlapRange, overlap],
];
linkedInputs.forEach(([source, mirror]) => {
    source.addEventListener('input', () => {
        mirror.value = source.value;
        updateChunks();
    });
});

// Render the chunks for the default text right away.
updateChunks();
196
+ });
197
+ </script>
198
+ </body>
199
+ </html>