File size: 1,481 Bytes
cbbc229
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pytest
import numpy as np

from src import file_readers
import test_config


def test_preprocess_text():
    """
    Tests preprocess_text function by comparing its output
    for test.txt against the lines of test_processed.txt.
    """
    test_path = test_config.data_path / "test.txt"
    processed_path = test_config.data_path / "test_processed.txt"
    # Explicit encoding keeps the test independent of the platform's
    # default locale encoding.
    # NOTE(review): assumes both fixtures are stored as UTF-8 -- confirm.
    with open(test_path, 'r', encoding='utf-8') as file:
        test_corpus = file_readers.preprocess_text(file)
    with open(processed_path, 'r', encoding='utf-8') as process_file:
        # One expected entry per line; strip the newline and any
        # surrounding whitespace before comparing.
        processed_corpus = [line.strip() for line in process_file]

    assert processed_corpus == test_corpus


def test_read_pdf():
    """
    Tests read_pdf function by asserting shape of
    corpus and correct line reading.
    """
    parsed = file_readers.read_pdf(test_config.data_path / "test.pdf")
    # dtype=object keeps each top-level entry as a single array element.
    parsed_arr = np.array(parsed, dtype=object)

    # Four top-level entries, the first of which holds three items.
    assert parsed_arr.shape == (4, )
    first_entry = parsed_arr[0]
    assert np.shape(first_entry) == (3, )

    # Spot-check the leading text of the first and third entries.
    assert first_entry[0] == 'Lorem Ipsum'
    assert parsed_arr[2][0] == 'Preface'


def test_read_epub():
    """
    Checks read_epub against the test.epub fixture: the returned
    title, the shape of the returned corpus, and the first line
    of two of its entries.
    """
    corpus, title = file_readers.read_epub(test_config.data_path / "test.epub")
    # dtype=object keeps each top-level entry as a single array element.
    corpus_arr = np.array(corpus, dtype=object)
    expected_opening = 'The Project Gutenberg eBook of The Picture of Dorian Gray, by Oscar Wilde'

    assert title == "the_picture_of_dorian_gray"

    # Six top-level entries, the first of which holds 39 items.
    assert corpus_arr.shape == (6,)
    assert np.shape(corpus_arr[0]) == (39,)

    # Text checks read from the original return value, not the array.
    assert corpus[0][0] == expected_opening
    assert corpus[2][0] == 'CHAPTER I.'