From 815d4be881f640ab4cd51a6196ac56777c5924d2 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Thu, 31 May 2018 22:13:26 -0400 Subject: [PATCH 1/5] Fixed hanging error due to low memory on processing nodes, preset defaults changes --- app/boot.py | 12 +++--------- app/models/task.py | 21 ++++++++++++++++----- app/tests/test_api_preset.py | 2 +- app/tests/test_api_task.py | 18 ++++++++++++++++-- 4 files changed, 36 insertions(+), 17 deletions(-) diff --git a/app/boot.py b/app/boot.py index ef85792b..6c701299 100644 --- a/app/boot.py +++ b/app/boot.py @@ -98,22 +98,16 @@ def add_default_presets(): try: Preset.objects.update_or_create(name='DSM + DTM', system=True, defaults={ - 'options': [{'name': 'dsm', 'value': True}, {'name': 'dtm', 'value': True}, - {'name': 'mesh-octree-depth', 'value': 6}, - {'name': 'mesh-solver-divide', 'value': 6}]}) + 'options': [{'name': 'dsm', 'value': True}, {'name': 'dtm', 'value': True}]}) Preset.objects.update_or_create(name='Fast Orthophoto', system=True, defaults={'options': [{'name': 'fast-orthophoto', 'value': True}]}) - Preset.objects.update_or_create(name='High Quality', system=True, + Preset.objects.update_or_create(name='High Resolution', system=True, defaults={'options': [{'name': 'dsm', 'value': True}, - {'name': 'mesh-octree-depth', 'value': 6}, - {'name': 'mesh-solver-divide', 'value': 6}, {'name': 'dem-resolution', 'value': "0.04"}, {'name': 'orthophoto-resolution', 'value': "40"}, ]}) Preset.objects.update_or_create(name='Default', system=True, - defaults={'options': [{'name': 'dsm', 'value': True}, - {'name': 'mesh-octree-depth', 'value': 6}, - {'name': 'mesh-solver-divide', 'value': 6}]}) + defaults={'options': [{'name': 'dsm', 'value': True}]}) except MultipleObjectsReturned: # Mostly to handle a legacy code problem where # multiple system presets with the same name were diff --git a/app/models/task.py b/app/models/task.py index 9408abaa..b68a8a25 100644 --- a/app/models/task.py +++ b/app/models/task.py @@ -305,11 +305,22 @@ class Task(models.Model): # Processing node assigned, but is offline and no errors if self.processing_node and not self.processing_node.is_online(): - # Detach processing node, will be processed at the next tick - logger.info("Processing node {} went offline, reassigning {}...".format(self.processing_node, self)) - self.uuid = '' - self.processing_node = None - self.save() + # If we are queued up + # detach processing node, and reassignment + # will be processed at the next tick + if self.status == status_codes.QUEUED: + logger.info("Processing node {} went offline, reassigning {}...".format(self.processing_node, self)) + self.uuid = '' + self.processing_node = None + self.save() + else: + # Task was running and processing node went offline + # It could have crashed due to low memory + # or perhaps it went offline due to network errors. + # We can't easily differentiate between the two, so we need + # to notify the user because if it crashes because of low memory + # the user might need to take action. + raise ProcessingError("Processing node went offline. This could be due to insufficient memory or a network error.") if self.processing_node: # Need to process some images (UUID not yet set and task doesn't have pending actions)? diff --git a/app/tests/test_api_preset.py b/app/tests/test_api_preset.py index d857b310..777b0941 100644 --- a/app/tests/test_api_preset.py +++ b/app/tests/test_api_preset.py @@ -23,7 +23,7 @@ class TestApiPreset(BootTestCase): def check_default_presets(self): self.assertTrue(Preset.objects.filter(name="Default", system=True).exists()) self.assertTrue(Preset.objects.filter(name="DSM + DTM", system=True).exists()) - self.assertTrue(Preset.objects.filter(name="High Quality", system=True).exists()) + self.assertTrue(Preset.objects.filter(name="High Resolution", system=True).exists()) def test_preset(self): client = APIClient() diff --git a/app/tests/test_api_task.py b/app/tests/test_api_task.py index 95092838..6708ab87 100644 --- a/app/tests/test_api_task.py +++ b/app/tests/test_api_task.py @@ -570,9 +570,9 @@ class TestApiTask(BootTransactionTestCase): another_pnode.last_refreshed = timezone.now() another_pnode.save() - # Remove error + # Remove error, set status to queued task.last_error = None - task.status = None + task.status = status_codes.QUEUED task.save() worker.tasks.process_pending_tasks() @@ -586,6 +586,20 @@ class TestApiTask(BootTransactionTestCase): task.refresh_from_db() self.assertTrue(task.processing_node.id == another_pnode.id) + # Set task to queued, bring node offline + task.status = status_codes.RUNNING + task.save() + another_pnode.last_refreshed = timezone.now() - timedelta(minutes=OFFLINE_MINUTES) + another_pnode.save() + + worker.tasks.process_pending_tasks() + task.refresh_from_db() + + # Processing node is still there, but task should have failed + self.assertTrue(task.status == status_codes.FAILED) + self.assertTrue("Processing node went offline." in task.last_error) + + def test_task_manual_processing_node(self): user = User.objects.get(username="testuser") project = Project.objects.create(name="User Test Project", owner=user) From c89e87f9f8135bc0fd47d296543677a92c386944 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Thu, 31 May 2018 22:15:29 -0400 Subject: [PATCH 2/5] Rephrase comment --- app/models/task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/models/task.py b/app/models/task.py index b68a8a25..a520d8d3 100644 --- a/app/models/task.py +++ b/app/models/task.py @@ -318,8 +318,8 @@ class Task(models.Model): # It could have crashed due to low memory # or perhaps it went offline due to network errors. # We can't easily differentiate between the two, so we need - # to notify the user because if it crashes because of low memory - # the user might need to take action. + # to notify the user because if it crashed due to low memory + # the user might need to take action (or be stuck in an infinite loop) raise ProcessingError("Processing node went offline. This could be due to insufficient memory or a network error.") if self.processing_node: From 2f18b61d840d26921fc563caf471d36360f00f51 Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Thu, 31 May 2018 22:23:37 -0400 Subject: [PATCH 3/5] Set status to none when reassigning queued task pnode --- app/models/task.py | 1 + app/tests/test_api_task.py | 1 + 2 files changed, 2 insertions(+) diff --git a/app/models/task.py b/app/models/task.py index a520d8d3..379bc3ae 100644 --- a/app/models/task.py +++ b/app/models/task.py @@ -312,6 +312,7 @@ class Task(models.Model): logger.info("Processing node {} went offline, reassigning {}...".format(self.processing_node, self)) self.uuid = '' self.processing_node = None + self.status = None self.save() else: # Task was running and processing node went offline diff --git a/app/tests/test_api_task.py b/app/tests/test_api_task.py index 6708ab87..f28ef8d5 100644 --- a/app/tests/test_api_task.py +++ b/app/tests/test_api_task.py @@ -585,6 +585,7 @@ class TestApiTask(BootTransactionTestCase): task.refresh_from_db() self.assertTrue(task.processing_node.id == another_pnode.id) + self.assertTrue(task.status == None) # Set task to queued, bring node offline task.status = status_codes.RUNNING From 8306fd75397aacf98252aad793047e408aaf53df Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Thu, 31 May 2018 22:54:55 -0400 Subject: [PATCH 4/5] unit test fix --- app/models/task.py | 4 ++-- app/tests/test_api_preset.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app/models/task.py b/app/models/task.py index 379bc3ae..365747ff 100644 --- a/app/models/task.py +++ b/app/models/task.py @@ -301,6 +301,7 @@ class Task(models.Model): self.processing_node.save() logger.info("Automatically assigned processing node {} to {}".format(self.processing_node, self)) + self.status = None self.save() # Processing node assigned, but is offline and no errors @@ -312,9 +313,8 @@ class Task(models.Model): logger.info("Processing node {} went offline, reassigning {}...".format(self.processing_node, self)) self.uuid = '' self.processing_node = None - self.status = None self.save() - else: + elif self.status == status_codes.RUNNING: # Task was running and processing node went offline # It could have crashed due to low memory # or perhaps it went offline due to network errors. diff --git a/app/tests/test_api_preset.py b/app/tests/test_api_preset.py index 777b0941..8d01ed1e 100644 --- a/app/tests/test_api_preset.py +++ b/app/tests/test_api_preset.py @@ -55,7 +55,7 @@ class TestApiPreset(BootTestCase): # Only ours and global presets are available self.assertTrue(len(res.data) == 7) self.assertTrue('My Local Preset' in [preset['name'] for preset in res.data]) - self.assertTrue('High Quality' in [preset['name'] for preset in res.data]) + self.assertTrue('High Resolution' in [preset['name'] for preset in res.data]) self.assertTrue('Global Preset #1' in [preset['name'] for preset in res.data]) self.assertTrue('Global Preset #2' in [preset['name'] for preset in res.data]) self.assertFalse('Local Preset #1' in [preset['name'] for preset in res.data]) From b8665ad1812c36b744addc7200fdda0bdadae9cb Mon Sep 17 00:00:00 2001 From: Piero Toffanin Date: Thu, 31 May 2018 23:14:11 -0400 Subject: [PATCH 5/5] More unit test fixes --- app/models/task.py | 3 ++- app/tests/test_api_task.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/app/models/task.py b/app/models/task.py index 365747ff..d80f4faf 100644 --- a/app/models/task.py +++ b/app/models/task.py @@ -301,7 +301,6 @@ class Task(models.Model): self.processing_node.save() logger.info("Automatically assigned processing node {} to {}".format(self.processing_node, self)) - self.status = None self.save() # Processing node assigned, but is offline and no errors @@ -313,7 +312,9 @@ class Task(models.Model): logger.info("Processing node {} went offline, reassigning {}...".format(self.processing_node, self)) self.uuid = '' self.processing_node = None + self.status = None self.save() + elif self.status == status_codes.RUNNING: # Task was running and processing node went offline # It could have crashed due to low memory diff --git a/app/tests/test_api_task.py b/app/tests/test_api_task.py index f28ef8d5..4b90f476 100644 --- a/app/tests/test_api_task.py +++ b/app/tests/test_api_task.py @@ -580,14 +580,15 @@ class TestApiTask(BootTransactionTestCase): # Processing node is now cleared and a new one will be assigned on the next tick task.refresh_from_db() self.assertTrue(task.processing_node is None) + self.assertTrue(task.status is None) worker.tasks.process_pending_tasks() task.refresh_from_db() self.assertTrue(task.processing_node.id == another_pnode.id) - self.assertTrue(task.status == None) # Set task to queued, bring node offline + task.last_error = None task.status = status_codes.RUNNING task.save() another_pnode.last_refreshed = timezone.now() - timedelta(minutes=OFFLINE_MINUTES)