Unit tests for csv feeder + fix some bugs

pull/189/head
Patrick Robertson 2025-02-04 13:37:17 +01:00
rodzic b301f60ea3
commit 78e6418249
4 zmienionych plików z 82 dodań i 4 usunięć

Wyświetl plik

@ -7,16 +7,32 @@ from auto_archiver.utils import url_or_none
class CSVFeeder(Feeder):
    """Feeder that yields one ``Metadata`` item per valid URL found in CSV files.

    The files to read come from ``self.files``. The column holding the URLs
    is chosen with ``column``.
    """

    # Column holding the URLs: an integer index, a header name (str), or
    # None to fall back to the first column (index 0).
    column = None

    def __iter__(self) -> Metadata:
        """Iterate over every file in ``self.files`` and yield its URLs.

        For each file the first row is inspected:

        * if ``column`` is a string, the first row is treated as a header
          and the column index is looked up by name; when the name is
          missing an error is logged and iteration stops;
        * otherwise, if the first row's cell does not pass ``url_or_none``
          it is treated as a header and skipped;
        * otherwise the first row is data and the file is rewound.

        Rows whose URL cell is missing or does not pass ``url_or_none`` are
        logged with a warning and skipped.

        Yields:
            A ``Metadata`` object with its URL set, one per valid row.
        """
        for file in self.files:
            # newline="" is what the csv module asks for so that quoted
            # fields containing line breaks are parsed correctly.
            with open(file, "r", newline="") as f:
                reader = csv.reader(f)

                # A bare next() would raise StopIteration on an empty file,
                # which turns into RuntimeError inside a generator.
                first_row = next(reader, None)
                if first_row is None:
                    logger.warning(f"CSV file {file} is empty, skipping")
                    continue

                url_column = self.column or 0
                if isinstance(url_column, str):
                    try:
                        url_column = first_row.index(url_column)
                    except ValueError:
                        logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
                        return
                elif len(first_row) <= url_column or not url_or_none(first_row[url_column]):
                    # it's a header row, but we've been given a column number already
                    logger.debug(f"Skipping header row: {first_row}")
                else:
                    # first row isn't a header row, rewind the file
                    f.seek(0)

                for row in reader:
                    # Blank or short rows have no cell at url_column; treat
                    # them like any other invalid row instead of crashing.
                    if len(row) <= url_column or not url_or_none(row[url_column]):
                        logger.warning(f"Not a valid URL in row: {row}, skipping")
                        continue
                    url = row[url_column]
                    logger.debug(f"Processing {url}")
                    yield Metadata().set_url(url)

Wyświetl plik

@ -0,0 +1,2 @@
https://example.com/1/,data 1
https://example.com/2/,data 2
1 https://example.com/1/ data 1
2 https://example.com/2/ data 2

Wyświetl plik

@ -0,0 +1,3 @@
webpages,other data
https://example.com/1/,data 1
https://example.com/2/,data 2
1 webpages other data
2 https://example.com/1/ data 1
3 https://example.com/2/ data 2

Wyświetl plik

@ -0,0 +1,57 @@
import pytest
@pytest.fixture
def headerless_csv_file():
    """Provide the path of the CSV test file that has no header row."""
    csv_path = "tests/data/csv_no_headers.csv"
    return csv_path
@pytest.fixture
def header_csv_file():
    """Provide the path of the CSV test file that starts with a header row."""
    csv_path = "tests/data/csv_with_headers.csv"
    return csv_path
@pytest.fixture
def header_csv_file_non_default_column():
    """Provide the path of a CSV test file with headers and a non-default URL column."""
    # NOTE(review): no test visible in this file requests this fixture;
    # confirm the referenced data file exists before relying on it.
    csv_path = "tests/data/csv_with_headers_non_default_column.csv"
    return csv_path
def test_csv_feeder_no_headers(headerless_csv_file, setup_module):
    """A header-less CSV yields both of its URLs, in file order."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    options = {"files": [headerless_csv_file]}
    csv_feeder = setup_module(CSVFeeder, options)

    produced = list(csv_feeder)

    assert len(produced) == 2
    first, second = produced
    assert first.get_url() == "https://example.com/1/"
    assert second.get_url() == "https://example.com/2/"
def test_csv_feeder_with_headers(header_csv_file, setup_module):
    """A CSV with a header row yields only the two data-row URLs."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    options = {"files": [header_csv_file]}
    csv_feeder = setup_module(CSVFeeder, options)

    produced = list(csv_feeder)

    assert len(produced) == 2
    first, second = produced
    assert first.get_url() == "https://example.com/1/"
    assert second.get_url() == "https://example.com/2/"
def test_csv_feeder_wrong_column(header_csv_file, setup_module, caplog):
    """Pointing at a column without URLs yields nothing and warns once per data row."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    options = {"files": [header_csv_file], "column": 1}
    with caplog.at_level("WARNING"):
        csv_feeder = setup_module(CSVFeeder, options)
        produced = list(csv_feeder)
        assert len(produced) == 0
        assert "Not a valid URL in row" in caplog.text
        # One warning record is expected for each of the two data rows.
        assert len(caplog.records) == 2
def test_csv_feeder_column_by_name(header_csv_file, setup_module):
    """Selecting the URL column by its header name yields both URLs, in file order."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    options = {"files": [header_csv_file], "column": "webpages"}
    csv_feeder = setup_module(CSVFeeder, options)

    produced = list(csv_feeder)

    assert len(produced) == 2
    first, second = produced
    assert first.get_url() == "https://example.com/1/"
    assert second.get_url() == "https://example.com/2/"