kopia lustrzana https://github.com/bellingcat/auto-archiver
Unit tests for csv feeder + fix some bugs
rodzic
b301f60ea3
commit
78e6418249
|
@ -7,16 +7,32 @@ from auto_archiver.utils import url_or_none
|
||||||
|
|
||||||
class CSVFeeder(Feeder):
    """Feeder that yields one ``Metadata`` item per valid URL read from CSV files.

    Every path in ``self.files`` is opened and parsed with ``csv.reader``.
    The URL is taken from a single column, selected by ``column``.
    """

    # Column holding the URLs: ``None`` (falls back to index 0), an integer
    # index, or a header name that is looked up in each file's first row.
    column = None

    @staticmethod
    def _cell(row, index):
        """Return ``row[index]``, or ``None`` when the row has no such column."""
        try:
            return row[index]
        except IndexError:
            return None

    def __iter__(self) -> Metadata:
        """Yield a ``Metadata`` with its URL set for each valid URL cell.

        This is a generator, so the items are produced lazily while the
        caller iterates. For each file:

        * an empty file is skipped with a warning;
        * when ``column`` is a string it is resolved against the first row,
          and if the name is missing an error is logged and iteration stops;
        * otherwise the first row is skipped when its URL cell is missing or
          is not a URL (treated as a header), and kept when it is a URL;
        * data rows whose URL cell is missing or invalid are skipped with a
          warning.
        """
        for file in self.files:
            # newline="" is what the csv module asks for, so that quoted
            # fields containing line breaks are parsed correctly.
            with open(file, "r", newline="") as f:
                reader = csv.reader(f)

                # A bare next(reader) would raise StopIteration on an empty
                # file, which becomes RuntimeError inside a generator.
                first_row = next(reader, None)
                if first_row is None:
                    logger.warning(f"CSV file {file} is empty, skipping")
                    continue

                # Resolved per file: a header name can sit at a different
                # index in each file.
                url_column = self.column or 0
                if isinstance(url_column, str):
                    try:
                        url_column = first_row.index(url_column)
                    except ValueError:
                        logger.error(f"Column {url_column} not found in header row: {first_row}. Did you set the 'column' config correctly?")
                        # Ends the whole feed, including any remaining files.
                        return
                else:
                    first_cell = self._cell(first_row, url_column)
                    if first_cell is None or not url_or_none(first_cell):
                        # it's a header row, but we've been given a column number already
                        logger.debug(f"Skipping header row: {first_row}")
                    else:
                        # first row isn't a header row, rewind the file
                        f.seek(0)

                for row in reader:
                    url = self._cell(row, url_column)
                    # Blank or short rows have no URL cell; treat them like
                    # any other invalid row instead of raising IndexError.
                    if url is None or not url_or_none(url):
                        logger.warning(f"Not a valid URL in row: {row}, skipping")
                        continue
                    logger.debug(f"Processing {url}")
                    yield Metadata().set_url(url)
|
|
@ -0,0 +1,2 @@
|
||||||
|
https://example.com/1/,data 1
|
||||||
|
https://example.com/2/,data 2
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
webpages,other data
|
||||||
|
https://example.com/1/,data 1
|
||||||
|
https://example.com/2/,data 2
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
@pytest.fixture
def headerless_csv_file():
    """Provide the path of the headerless sample CSV."""
    path = "tests/data/csv_no_headers.csv"
    return path
|
||||||
|
|
||||||
|
@pytest.fixture
def header_csv_file():
    """Provide the path of the sample CSV that starts with a header row."""
    path = "tests/data/csv_with_headers.csv"
    return path
|
||||||
|
|
||||||
|
@pytest.fixture
def header_csv_file_non_default_column():
    """Provide the path of the sample CSV with headers and a non-default column."""
    path = "tests/data/csv_with_headers_non_default_column.csv"
    return path
|
||||||
|
|
||||||
|
|
||||||
|
def test_csv_feeder_no_headers(headerless_csv_file, setup_module):
    """Feeding the headerless CSV yields exactly the two expected URLs."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    config = {"files": [headerless_csv_file]}
    items = list(setup_module(CSVFeeder, config))

    assert len(items) == 2
    first, second = items
    assert first.get_url() == "https://example.com/1/"
    assert second.get_url() == "https://example.com/2/"
|
||||||
|
|
||||||
|
def test_csv_feeder_with_headers(header_csv_file, setup_module):
    """Feeding the CSV with a header row yields exactly the two expected URLs."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    config = {"files": [header_csv_file]}
    items = list(setup_module(CSVFeeder, config))

    assert len(items) == 2
    first, second = items
    assert first.get_url() == "https://example.com/1/"
    assert second.get_url() == "https://example.com/2/"
|
||||||
|
|
||||||
|
def test_csv_feeder_wrong_column(header_csv_file, setup_module, caplog):
    """Selecting column index 1 yields no items and logs two invalid-URL warnings."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    config = {"files": [header_csv_file], "column": 1}

    # The feeder is consumed inside the capture context because its warnings
    # are emitted while iterating, not when it is constructed.
    with caplog.at_level("WARNING"):
        items = list(setup_module(CSVFeeder, config))

    assert len(items) == 0
    assert "Not a valid URL in row" in caplog.text
    assert len(caplog.records) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_csv_feeder_column_by_name(header_csv_file, setup_module):
    """Selecting the URL column by its header name yields the two expected URLs."""
    from auto_archiver.modules.csv_feeder.csv_feeder import CSVFeeder

    config = {"files": [header_csv_file], "column": "webpages"}
    items = list(setup_module(CSVFeeder, config))

    assert len(items) == 2
    first, second = items
    assert first.get_url() == "https://example.com/1/"
    assert second.get_url() == "https://example.com/2/"
|
Ładowanie…
Reference in New Issue