From 2d8e977337782e46cd189b7211bb0965c089e905 Mon Sep 17 00:00:00 2001 From: Maarten Bosmans Date: Mon, 28 Nov 2022 20:23:25 +0100 Subject: [PATCH 1/2] Some small cleanup User-visible strings should be quoted with " and ' is for internal strings. --- ipfn/ipfn.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py index 6b2181b..3d9e8b0 100755 --- a/ipfn/ipfn.py +++ b/ipfn/ipfn.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python from __future__ import print_function import numpy as np import pandas as pd @@ -41,7 +40,7 @@ def __init__(self, original, aggregates, dimensions, weight_col='total', self.conv_rate = convergence_rate self.max_itr = max_iteration if verbose not in [0, 1, 2]: - raise(ValueError(f"wrong verbose input, must be either 0, 1 or 2 but got {verbose}")) + raise ValueError(f"wrong verbose input, must be either 0, 1 or 2 but got {verbose}") self.verbose = verbose self.rate_tolerance = rate_tolerance @@ -74,7 +73,7 @@ def ipfn_np(self, m, aggregates, dimensions, weight_col='total'): m = IPF.iteration() """ - # Check that the inputs are numpay arrays of floats + # Check that the inputs are numpy arrays of floats inc = 0 for aggregate in aggregates: if not isinstance(aggregate, np.ndarray): @@ -142,7 +141,7 @@ def ipfn_np(self, m, aggregates, dimensions, weight_col='total'): ori_ijk = aggregates[inc][item] m_slice = m[idx] m_ijk = m_slice.sum() - # print('Current vs original', abs(m_ijk/ori_ijk - 1)) + # print("Current vs original", abs(m_ijk/ori_ijk - 1)) if abs(m_ijk / ori_ijk - 1) > max_conv: max_conv = abs(m_ijk / ori_ijk - 1) @@ -268,7 +267,7 @@ def iteration(self): ipfn_method = self.ipfn_np self.original = self.original.astype('float64') else: - raise(ValueError(f'Data input instance not recognized. The input matrix is not a numpy array or pandas DataFrame')) + raise ValueError(f"Data input instance not recognized. The input matrix is not a numpy array or pandas DataFrame") while ((i <= self.max_itr and conv > self.conv_rate) and (i <= self.max_itr and abs(conv - old_conv) > self.rate_tolerance)): old_conv = conv m, conv = ipfn_method(m, self.aggregates, self.dimensions, self.weight_col) @@ -277,11 +276,11 @@ def iteration(self): converged = 1 if i <= self.max_itr: if (not conv > self.conv_rate) & (self.verbose > 1): - print('ipfn converged: convergence_rate below threshold') + print("ipfn converged: convergence_rate below threshold") elif not abs(conv - old_conv) > self.rate_tolerance: - print('ipfn converged: convergence_rate not updating or below rate_tolerance') + print("ipfn converged: convergence_rate not updating or below rate_tolerance") else: - print('Maximum iterations reached') + print("Maximum iterations reached") converged = 0 # Handle the verbose @@ -292,4 +291,4 @@ def iteration(self): elif self.verbose == 2: return m, converged, pd.DataFrame({'iteration': range(i), 'conv': conv_list}).set_index('iteration') else: - raise(ValueError(f'wrong verbose input, must be either 0, 1 or 2 but got {self.verbose}')) + raise ValueError(f"wrong verbose input, must be either 0, 1 or 2 but got {self.verbose}") From ae61437248b4fa7665589e8da09212943b32b251 Mon Sep 17 00:00:00 2001 From: Maarten Bosmans Date: Mon, 28 Nov 2022 20:29:28 +0100 Subject: [PATCH 2/2] Rework iteration loop The max_itr is now more explicit and the convergence criteria are only checked in one place instead of in the while condition and after the loop. The loop now actually stops after doing max_itr iterations, where previously it would do iterations numbered 0 to max_itr, i.e. one iteration too many. --- ipfn/ipfn.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py index 3d9e8b0..db4a19a 100755 --- a/ipfn/ipfn.py +++ b/ipfn/ipfn.py @@ -253,14 +253,12 @@ def iteration(self): """ Runs the ipfn algorithm. Automatically detects of working with numpy ndarray or pandas dataframes. """ - - i = 0 - conv = np.inf - old_conv = -np.inf + old_conv = np.inf conv_list = [] + converged = 1 m = self.original - # If the original data input is in pandas DataFrame format + # Prepare input data if isinstance(self.original, pd.DataFrame): ipfn_method = self.ipfn_df elif isinstance(self.original, np.ndarray): @@ -268,17 +266,19 @@ def iteration(self): self.original = self.original.astype('float64') else: raise ValueError(f"Data input instance not recognized. The input matrix is not a numpy array or pandas DataFrame") - while ((i <= self.max_itr and conv > self.conv_rate) and (i <= self.max_itr and abs(conv - old_conv) > self.rate_tolerance)): - old_conv = conv + + # Run iterations + for i in range(self.max_itr): m, conv = ipfn_method(m, self.aggregates, self.dimensions, self.weight_col) conv_list.append(conv) - i += 1 - converged = 1 - if i <= self.max_itr: - if (not conv > self.conv_rate) & (self.verbose > 1): - print("ipfn converged: convergence_rate below threshold") - elif not abs(conv - old_conv) > self.rate_tolerance: + if conv <= self.conv_rate: + if self.verbose > 1: + print("ipfn converged: convergence_rate below threshold") + break + if abs(conv - old_conv) <= self.rate_tolerance: print("ipfn converged: convergence_rate not updating or below rate_tolerance") + break + old_conv = conv else: print("Maximum iterations reached") converged = 0 @@ -289,6 +289,6 @@ def iteration(self): elif self.verbose == 1: return m, converged elif self.verbose == 2: - return m, converged, pd.DataFrame({'iteration': range(i), 'conv': conv_list}).set_index('iteration') + return m, converged, pd.DataFrame({'iteration': range(1, i+2), 'conv': conv_list}).set_index('iteration') else: raise ValueError(f"wrong verbose input, must be either 0, 1 or 2 but got {self.verbose}")