from typing import List

from pytest import fixture

from create_db import split_text


@fixture
def sample_text() -> List[str]:
    """Return two sample documents used as input for ``split_text``."""
    return [
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
        "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. "
        "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. "
        "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. "
        "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.",
        "Another long text string to demonstrate the splitting functionality. This text should also be split into multiple chunks.",
    ]


def _overlap_length(left: str, right: str) -> int:
    """Return the length of the longest suffix of *left* that is a prefix of *right*."""
    for size in range(min(len(left), len(right)), 0, -1):
        if left.endswith(right[:size]):
            return size
    return 0


def test_split_text(sample_text):
    """Check the shape, size, content and overlap of the chunks from ``split_text``.

    The test expects ``split_text`` to return a sequence of chunks, each chunk
    being a non-empty list of strings taken from the input texts.
    """
    # NOTE(review): these limits come from the figures used in the original
    # assertions (1500 characters, 150 overlap) -- confirm they match the
    # splitter configuration in create_db.
    chunk_size = 1500
    chunk_overlap = 150

    # Split the sample text into chunks
    chunks = split_text(sample_text)

    # Every check below is an all(...) over the chunks and would pass vacuously
    # on an empty result, so require at least one chunk for non-empty input.
    assert chunks, "split_text returned no chunks for non-empty input"

    # Assert that the chunks are lists of strings
    assert all(
        isinstance(chunk, list) and all(isinstance(text, str) for text in chunk)
        for chunk in chunks
    )

    # Assert that the chunks are not empty
    assert all(chunk for chunk in chunks)

    # Assert that no chunk exceeds the chunk size. No lower bound is checked:
    # the fixture texts are far shorter than one chunk, and the final chunk of
    # any text may legitimately be short.
    chunk_texts = ["".join(chunk) for chunk in chunks]
    assert all(len(text) <= chunk_size for text in chunk_texts)

    # Assert that the chunks contain the original text
    original_text = " ".join(sample_text)
    assert all(text in original_text for chunk in chunks for text in chunk)

    # Assert that consecutive chunks share at most ``chunk_overlap`` characters.
    # The comparison is done on the joined text so that it measures characters
    # rather than list elements.
    for previous_text, next_text in zip(chunk_texts, chunk_texts[1:]):
        assert _overlap_length(previous_text, next_text) <= chunk_overlap