make (partial) disaggregation easier #199

gdementen · 2017-04-04T08:50:58Z

We need to find some way to make this easier (and fix all the bugs I just came across) -- I will open separate issues for each bug, but the main feature request will remain:

>>> arr = ndtest(10)
>>> a = arr.a
>>> agg = arr.sum(a.by(2)).rename('a', 'group')
>>> agg
group | a0:a1 | a2:a3 | a4:a5 | a6:a7 | a8:a9
      |     1 |     5 |     9 |    13 |    17
>>> group = agg.group
>>> expand = zeros(group, a)
>>> expand
group | a0:a1 | a2:a3 | a4:a5 | a6:a7 | a8:a9
      |   0.0 |   0.0 |   0.0 |   0.0 |   0.0
>>> expand = zeros((group, a))
>>> expand
group\a |  a0 |  a1 |  a2 |  a3 |  a4 |  a5 |  a6 |  a7 |  a8 |  a9
  a0:a1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a2:a3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a4:a5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a6:a7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a8:a9 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
>>> expand['a0:a1', 'a0':'a1'] = 0.5
ValueError                                Traceback (most recent call last)
...
ValueError: a0:a1 is ambiguous (valid in group, a)
>>> expand[group['a0:a1'], 'a0':'a1'] = 0.5
ValueError                                Traceback (most recent call last)
...
ValueError: group['a0':'a1'] is not a valid label for any axis
>>> expand[x.group['a0:a1'], 'a0':'a1'] = 0.5
>>> expand
group\a |  a0 |  a1 |  a2 |  a3 |  a4 |  a5 |  a6 |  a7 |  a8 |  a9
  a0:a1 | 0.5 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a2:a3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a4:a5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a6:a7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a8:a9 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
>>> for g in group:
...     print(g.split(':'))
AttributeError                            Traceback (most recent call last)
...
AttributeError: 'PGroup' object has no attribute 'split'
>>> for g in group:
...     print(g.eval().split(':'))
['a0', 'a1']
['a2', 'a3']
['a4', 'a5']
['a6', 'a7']
['a8', 'a9']
>>> for g in group:
...     expand[g, slice(*g.eval().split(':'))] = 1 / len(g)
>>> expand
group\a |  a0 |  a1 |  a2 |  a3 |  a4 |  a5 |  a6 |  a7 |  a8 |  a9
  a0:a1 | 0.2 | 0.2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a2:a3 | 0.0 | 0.0 | 0.2 | 0.2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a4:a5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.2 | 0.2 | 0.0 | 0.0 | 0.0 | 0.0
  a6:a7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.2 | 0.2 | 0.0 | 0.0
  a8:a9 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.2 | 0.2
>>> len(group.i[0])
5
>>> for g in group:
...     expand[g, slice(*g.eval().split(':'))] = 0.5
>>> expand
group\a |  a0 |  a1 |  a2 |  a3 |  a4 |  a5 |  a6 |  a7 |  a8 |  a9
  a0:a1 | 0.5 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a2:a3 | 0.0 | 0.0 | 0.5 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a4:a5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.5 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0
  a6:a7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.5 | 0.5 | 0.0 | 0.0
  a8:a9 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.5 | 0.5
>>> agg
group | a0:a1 | a2:a3 | a4:a5 | a6:a7 | a8:a9
      |     1 |     5 |     9 |    13 |    17
>>> agg * expand
group\a |  a0 |  a1 |  a2 |  a3 |  a4 |  a5 |  a6 |  a7 |  a8 |  a9
  a0:a1 | 0.5 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a2:a3 | 0.0 | 0.0 | 2.5 | 2.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0
  a4:a5 | 0.0 | 0.0 | 0.0 | 0.0 | 4.5 | 4.5 | 0.0 | 0.0 | 0.0 | 0.0
  a6:a7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.5 | 6.5 | 0.0 | 0.0
  a8:a9 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 8.5 | 8.5
>>> (agg * expand).sum(group)
a |  a0 |  a1 |  a2 |  a3 |  a4 |  a5 |  a6 |  a7 |  a8 |  a9
  | 0.5 | 0.5 | 2.5 | 2.5 | 4.5 | 4.5 | 6.5 | 6.5 | 8.5 | 8.5
>>> arr
a | a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7 | a8 | a9
  |  0 |  1 |  2 |  3 |  4 |  5 |  6 |  7 |  8 |  9

gdementen · 2017-04-04T10:48:46Z

see #200, #202 and #203 for linked issues.

gdementen · 2017-04-05T14:12:41Z

The loop can be rewritten more generically as:

>>> for g in group:
...     start, stop = g.eval().split(':')
...     target = a[start:stop]
...     expand[g, target] = 1 / len(target)

gdementen · 2017-04-05T14:14:33Z

and (agg * expand).sum(group) is equivalent to simply agg @ expand in this case

gdementen · 2017-04-06T10:05:12Z

A first proof of concept:

def disag_array(array, source_axis, target_axis, mapping=None, fixoverlap=True):
    """Build the (source_axis, target_axis) array of disaggregation weights.

    Parameters
    ----------
    array
        Aggregated array; only ``array.axes[source_axis]`` is read.
    source_axis
        Key identifying the aggregated axis in ``array.axes``.
    target_axis
        Axis holding the disaggregated labels.
    mapping : optional
        ``{source label: target labels}``. When None, the evaluated source
        label itself is used as the key into ``target_axis``.
    fixoverlap : bool, optional
        When True, each weight is divided by the count (taken along the
        source axis) of strictly positive weights for its target label.

    Returns
    -------
    The weights array: every source label spreads ``1 / len(target_group)``
    over its target labels and 0 elsewhere (before the overlap correction).
    """
    source_axis = array.axes[source_axis]
    weights = zeros((source_axis, target_axis))
    for source_group in source_axis:
        label = source_group.eval()
        if mapping is None:
            target_labels = label
        else:
            target_labels = mapping[label]
        target_group = target_axis[target_labels]
        # even split of the aggregated value over the labels of its group
        weights[source_group, target_group] = 1 / len(target_group)
    if fixoverlap:
        # presumably meant for target labels claimed by several source
        # groups -- TODO confirm behavior for labels claimed by none
        weights /= (weights > 0).sum(source_axis)
    return weights

# make a method out of this
def disag(array, source_axis, target_axis, mapping=None):
    """Disaggregate ``array`` by matrix-multiplying it with its weights array.

    The weights come from ``disag_array`` called with the same arguments
    (and its default ``fixoverlap``).
    """
    weights = disag_array(array, source_axis, target_axis, mapping)
    return array @ weights

then

>>> arr = ndtest(3)
>>> arr
a | a0 | a1 | a2
  |  0 |  1 |  2
>>> agg = arr.sum('a1:a2;a0:a1').rename('a', 'group')
>>> agg
group | a1:a2 | a0:a1
      |     3 |     1
>>> disag(agg, x.group, arr.a)
a |  a0 |  a1 |  a2
  | 0.5 | 1.0 | 1.5

gdementen · 2017-04-06T10:39:20Z

see also:
http://stackoverflow.com/questions/12223689/how-can-i-efficiently-disaggregate-data-in-a-dataframe-given-a-set-of-weights

gdementen · 2017-04-25T14:33:21Z

Here is a new version:

def disag_array(array, source_axis, groups=None, target_axis=None, fixoverlap=True):
    """Build the (source_axis, target_axis) array of disaggregation weights.

    Parameters
    ----------
    array
        Aggregated array; only ``array.axes[source_axis]`` is read.
    source_axis
        Key identifying the aggregated axis in ``array.axes``.
    groups : sequence or mapping, optional
        Target labels for each source label. A sequence must have one entry
        per label of the source axis (in axis order) and is turned into a
        mapping keyed by the items of the source axis. A mapping is used
        as-is. When None, the evaluated source label itself is used as the
        key into ``target_axis``.
    target_axis : optional
        Axis holding the disaggregated labels. When None, it is built from
        the labels of ``groups`` (deduplicated with ``np.unique``, hence
        sorted).
    fixoverlap : bool, optional
        When True, each weight is divided by the count (taken along the
        source axis) of strictly positive weights for its target label.

    Returns
    -------
    The weights array: every source label spreads ``1 / len(target_group)``
    over its target labels and 0 elsewhere (before the overlap correction).

    Raises
    ------
    ValueError
        If neither ``groups`` nor ``target_axis`` is given.
    """
    # The collections.Sequence / collections.Mapping aliases were removed in
    # Python 3.10; the ABCs live in collections.abc.
    from collections.abc import Mapping, Sequence

    source_axis = array.axes[source_axis]
    if isinstance(groups, Sequence):
        assert len(groups) == len(source_axis)
        if target_axis is None:
            target_axis = Axis(groups[0].axis, np.unique(np.concatenate([g.eval() for g in groups])))
        groups = dict(zip(source_axis, groups))
    if isinstance(groups, Mapping) and target_axis is None:
        # Keep `groups` a mapping (it is looked up by source group in the loop
        # below); the list in source-axis order only serves to infer the axis.
        ordered_groups = [groups[source_group.eval()] for source_group in source_axis]
        groups_labels = [g.eval() if isinstance(g, Group) else g for g in ordered_groups]
        first_group = ordered_groups[0]
        target_name = first_group.axis if isinstance(first_group, Group) else None
        target_axis = Axis(target_name, np.unique(np.concatenate(groups_labels)))
    if target_axis is None:
        raise ValueError('must specify groups, target_axis or both')

    weights = zeros((source_axis, target_axis))
    for source_group in source_axis:
        if groups is None:
            target_labels = source_group.eval()
        else:
            try:
                target_labels = groups[source_group]
            except KeyError:
                # the mapping may be keyed by raw labels rather than groups
                target_labels = groups[source_group.eval()]
        # make sure we have a group in case mapping returned raw labels
        target_group = target_axis[target_labels]
        # even split of the aggregated value over the labels of its group
        weights[source_group, target_group] = 1 / len(target_group)
    if fixoverlap:
        # presumably meant for target labels claimed by several source
        # groups -- TODO confirm behavior for labels claimed by none
        weights /= (weights > 0).sum(source_axis)
    return weights

# make a method out of this
def disag(array, source_axis, groups=None, target_axis=None):
    """Disaggregate ``array`` along ``source_axis``.

    Multiplies ``array`` by the weights returned by ``disag_array`` (called
    with the same arguments and its default ``fixoverlap``), then sums the
    product over ``source_axis``.
    """
    weights = disag_array(array, source_axis, groups, target_axis)
    weighted = array * weights
    return weighted.sum(source_axis)

then:

>>> disag(agg2, x.group, (arr.a['a1:a2'], arr.a['a0:a1']))
a |  a0 |  a1 |  a2
  | 0.5 | 1.0 | 1.5

>>> disag(agg2, x.group, target_axis=arr.a)
a |  a0 |  a1 |  a2
  | 0.5 | 1.0 | 1.5

>>> disag(agg, agegr, age.by(5))

points to improve:

I would like to support passing a single {agg_label: group} mapping. e.g.

>>> disag(agg, dict(zip(agegr, age.by(5))))
>>> disag(agg, {'a1:a2': ['a1', 'a2'], 'a0:a1': ['a0', 'a1']})

np.unique sorts the labels, which we should not do (at least not by default).
The mapping is a bit fragile with respect to Group vs raw labels, both as keys and as values. We should support all 4 combinations.

gdementen · 2017-05-08T10:43:51Z

Note that the @ optimization currently only works when the disag array is 2D, mostly due to the fact that @ uses the axes positions. It should be possible to transpose to make it work in all cases. Not sure it is worth it though.

gdementen · 2023-03-20T10:23:24Z

Katia needs this too but to disaggregate only a few labels. I don't know if the above code already works for that case or not. We should make sure it works though.

gdementen · 2023-03-20T10:43:39Z

The implementation so far assumes the aggregated cells correspond to sums. We should support not dividing by the length of the group (if the aggregation was a mean).

Are there other ways/methods to automatically infer the values (possibly a user-defined function)?

If users want an uneven split, I guess we should redirect them (probably worth mentioning in the disag function documentation) to use an explicit disag array.

We could also (probably in addition to mentioning the explicit disag array) provide some special syntax to create it (for disaggregating only a few labels, providing a disag array would seem overkill).

>>> disag(agg, {'a1:a2': {'a1': 0.4, 'a2': 0.6}, 'a0:a1': {'a0': 0.7, 'a1': 0.3}})

gdementen · 2023-10-02T14:21:21Z

Partial disaggregation is done quite often in user models:

>>> arr = ndtest('axis_v1=a0,a12,a3')
>>> arr
axis_v1  a0  a12  a3
          0    1   2
>>> arr.rename('axis_v1', 'axis_v2').set_labels('axis_v2', {'a12': 'a1'}).insert(0, after='a1', label = 'a2')
axis_v2  a0  a1  a2  a3
          0   1   0   2

It would be nice to have a simpler way to do this. The special syntax above would help:

>>> disag(arr, {'a12': {'a1': 1, 'a2': 0}})
axis_v2  a0  a1  a2  a3
          0   1   0   2

gdementen added the enhancement label Apr 4, 2017

gdementen changed the title ~~make reverse aggregation easier~~ make disaggregation easier Apr 4, 2017

alixdamman added this to the nice_to_have milestone Mar 7, 2018

alixdamman added the difficulty: high label Mar 7, 2018

gdementen modified the milestones: nice_to_have, 0.31 Jan 9, 2019

gdementen removed this from the 0.31 milestone Aug 1, 2019

alixdamman added this to the nice_to_have milestone Oct 10, 2019

gdementen changed the title ~~make disaggregation easier~~ make (partial) disaggregation easier Mar 20, 2023

gdementen added the priority: high label Mar 20, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

make (partial) disaggregation easier #199

make (partial) disaggregation easier #199

gdementen commented Apr 4, 2017 •

edited

Loading

gdementen commented Apr 4, 2017

Uh oh!

gdementen commented Apr 5, 2017

Uh oh!

gdementen commented Apr 5, 2017

Uh oh!

gdementen commented Apr 6, 2017 •

edited

Loading

Uh oh!

gdementen commented Apr 6, 2017

Uh oh!

gdementen commented Apr 25, 2017 •

edited

Loading

Uh oh!

gdementen commented May 8, 2017 •

edited

Loading

Uh oh!

gdementen commented Mar 20, 2023 •

edited

Loading

Uh oh!

gdementen commented Mar 20, 2023 •

edited

Loading

Uh oh!

gdementen commented Oct 2, 2023 •

edited

Loading

Uh oh!

make (partial) disaggregation easier #199

make (partial) disaggregation easier #199

Comments

gdementen commented Apr 4, 2017 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

gdementen commented Apr 4, 2017

Uh oh!

gdementen commented Apr 5, 2017

Uh oh!

gdementen commented Apr 5, 2017

Uh oh!

gdementen commented Apr 6, 2017 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

gdementen commented Apr 6, 2017

Uh oh!

gdementen commented Apr 25, 2017 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

gdementen commented May 8, 2017 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

gdementen commented Mar 20, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

gdementen commented Mar 20, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

gdementen commented Oct 2, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

gdementen commented Apr 4, 2017 •

edited

Loading

gdementen commented Apr 6, 2017 •

edited

Loading

gdementen commented Apr 25, 2017 •

edited

Loading

gdementen commented May 8, 2017 •

edited

Loading

gdementen commented Mar 20, 2023 •

edited

Loading

gdementen commented Mar 20, 2023 •

edited

Loading

gdementen commented Oct 2, 2023 •

edited

Loading