This repository has been archived on 2024-07-22. You can view files and clone it, but cannot push or open issues or pull requests.
Zaimki/census/analyse.py

200 lines
6.7 KiB
Python
Raw Permalink Normal View History

2022-04-16 04:57:07 -07:00
import json
import os
import shutil
from argparse import ArgumentParser
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Union

import pandas as pd
import plotly.express as px
import plotly.io as pio
# Census edition: read from the clock at import time. It names the output
# folder and is the reference year used when ages are derived in the script body.
year = datetime.now().year

# Directory layout, all relative to the folder that contains this script.
projectDir = Path(__file__).parent
inputDir = projectDir.joinpath('input')
# Results go straight into the Polish locale docs (alternative: projectDir / 'output').
outputDir = projectDir.parent.joinpath('locale', 'pl', 'docs', 'spis-%s' % year)

# When True, generateBar passes auto_open=True for every figure it writes.
openFigs = False

# Bar colours for single-series charts and for multi-series (comparison) charts.
colours = ['#c71585']
colours_multi = ['#dd5fa6', '#8b0f7a', '#15c79c']

# Turn off pandas' chained-assignment check for the whole script.
pd.options.mode.chained_assignment = None
def extractQuestion(
    df: pd.DataFrame,
    questionNumber: int,
    includeAnswers: bool = True,
    includeAggregates: bool = False,
    removeUnderscores: bool = True
) -> pd.Series:
    """Return the per-answer percentages for one question.

    Columns are selected by name: they must start with ``<questionNumber>_``
    and must not continue with ``_writein``. Columns whose remainder starts
    with ``aggr`` are treated as aggregates.

    Args:
        df: Frame holding the answer columns; ``len(df)`` is the denominator.
            Presumably each selected column holds 0/1 flags (verify against
            the export).
        questionNumber: Numeric prefix of the columns to pick.
        includeAnswers: Only consulted when ``includeAggregates`` is true;
            if false, only the ``aggr_`` columns are kept.
        includeAggregates: If false, ``aggr`` columns are excluded.
        removeUnderscores: Replace underscores in the labels with spaces.

    Returns:
        Series indexed by the cleaned-up column label, holding the column sum
        as a percentage of ``len(df)``, rounded to one decimal place.
    """
    # Regex fragment placed right after "<questionNumber>_".
    if not includeAggregates:
        infix = '(?!aggr)'
    elif includeAnswers:
        infix = ''
    else:
        infix = 'aggr_'
    pattern = '^%s_%s(?!_writein)' % (questionNumber, infix)
    selected = df.filter(regex=pattern)

    prefixLength = len(str(questionNumber)) + 1  # the number plus its underscore
    underscoreReplacement = ' ' if removeUnderscores else '_'
    labels = []
    for column in selected.columns:
        label = column[prefixLength:]
        label = label.replace('aggr_', 'łącznie: ')
        label = label.replace('_', underscoreReplacement)
        # Can only match when underscores were kept by the previous step.
        label = label.replace('łącznie: trans_', 'łącznie: trans*')
        labels.append(label)
    selected.columns = labels

    respondents = len(df)
    totals = selected.sum()
    return totals.apply(lambda total: round(100 * total / respondents, 1))
def generateBar(
    data: Union[pd.DataFrame, pd.Series],
    group: str,
    name: str,
    title: str,
    show: bool = False
):
    """Render ``data`` as a grouped bar chart and write it to an HTML file.

    The file is written to ``outputDir / group / (name + '.html')`` and
    references plotly.js from the CDN instead of embedding it.

    Args:
        data: Values to plot. A DataFrame with more than one column is drawn
            as a multi-series chart with a legend; anything else uses the
            single-series colour and no legend.
        group: Sub-directory of ``outputDir`` to write into.
        name: File name without the ``.html`` extension.
        title: Chart title.
        show: Passed on as ``auto_open`` (also enabled by the module-level
            ``openFigs`` flag).
    """
    is_multi = type(data) is pd.DataFrame and len(data.columns) > 1
    palette = colours_multi if is_multi else colours
    fig = px.bar(data, color_discrete_sequence=palette, barmode='group')

    legendLayout = dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1, title='')
    fig.update_layout(
        showlegend=is_multi,
        legend=legendLayout,
        title=title,
        xaxis=None,
        yaxis=None,
    )

    # Multi-series hovers get an extra line showing the trace's meta value.
    seriesLine = '<br>%{meta}' if is_multi else ''
    hoverTemplate = '%{x}<br>%{y:.2f}%' + seriesLine + '<extra></extra>'
    for trace in fig.select_traces():
        # offsetgroup is presumably the series name set by plotly express -- confirm.
        trace.update(hovertemplate=hoverTemplate, meta=trace.offsetgroup)

    target = outputDir / group / (name + '.html')
    pio.write_html(fig, file=target, auto_open=show or openFigs, include_plotlyjs='cdn')
def percent(value: int, size: int, precision: int = 2) -> float:
    """Return ``value`` as a percentage of ``size``.

    Args:
        value: The part.
        size: The whole; must be non-zero.
        precision: Number of decimal places to round to.
    """
    scaled = 100 * value
    return round(scaled / size, precision)
def ensureEmptyDir(dir: Path) -> Path:
    """Make sure ``dir`` exists and is empty, then return it.

    An existing directory is removed together with everything inside it
    before being recreated; missing parent directories are created as well.

    Args:
        dir: Directory to (re)create.

    Returns:
        The directory path, matching the declared return type (the function
        previously returned nothing despite the ``-> Path`` annotation).
    """
    if os.path.exists(dir):
        shutil.rmtree(dir)
    os.makedirs(dir, exist_ok=True)
    return Path(dir)
def analyse(group: str, df: pd.DataFrame, echo: bool = False):
    """Compute the statistics for one respondent group and save them as JSON.

    The group's output directory (``outputDir / group``) is emptied first,
    then ``stats.json`` is written into it.

    Args:
        group: Name of the group; used as the output sub-directory.
        df: Respondents belonging to the group. Must contain an ``age`` column
            and the numbered question columns read by ``extractQuestion``.
        echo: Also print the group name and the JSON to stdout.

    Returns:
        Dict of statistics; the per-question entries and ``age`` are Series,
        ``size`` is an int and ``ageStats`` a plain dict.
    """
    groupDir = outputDir / group
    ensureEmptyDir(groupDir)

    respondents = len(df)
    ages = df['age']
    ageHistogram = pd.Series(buildAgesHistogram(df))
    ageStats = {
        'avg': round(ages.mean(), 1),
        'median': round(ages.median(), 1),
        'std': round(ages.std(), 1),
        'under_30': percent(len(df[ages < 30]), respondents),
        'adults': percent(len(df[ages >= 18]), respondents),
    }

    stats = {
        'size': respondents,
        'age': ageHistogram,
        'ageStats': ageStats,
    }

    # (stats key, question number, extra extractQuestion options); the order
    # here is the key order of the resulting dict and of stats.json.
    aggregatesOnly = {'includeAnswers': False, 'includeAggregates': True}
    withAggregates = {'includeAggregates': True}
    questions = (
        ('pronounGroups', 6, {}),
        ('pronounGroupsAggr', 6, aggregatesOnly),
        ('pronouns', 7, {}),
        ('pronounsAggr', 7, aggregatesOnly),
        ('nouns', 8, {}),
        ('honorifics', 9, withAggregates),
        ('obstacles', 10, {}),
        ('reasons', 12, {}),
        ('groups', 11, {}),
        ('english', 13, withAggregates),
        ('labels', 14, {'includeAggregates': True, 'removeUnderscores': False}),
    )
    for key, number, options in questions:
        stats[key] = extractQuestion(df, number, **options)

    # Series are not JSON-serialisable, so they are converted to plain dicts.
    serialisable = {}
    for key, value in stats.items():
        serialisable[key] = value.to_dict() if type(value) is pd.Series else value
    statsJson = json.dumps(serialisable, indent=4)

    if echo:
        print('--- Group: %s ---' % group)
        print(statsJson)

    (groupDir / 'stats.json').write_text(statsJson)

    return stats
def buildAgesHistogram(df: pd.DataFrame) -> pd.Series:
    """Return the age distribution of ``df`` as percentages per year of age.

    Only positive ages are counted. Every integer age between the youngest
    and the oldest counted respondent gets an entry, including ages nobody
    reported (0.0), so the histogram has no gaps.

    Args:
        df: Frame with a numeric ``age`` column.

    Returns:
        Series indexed by age holding the share of counted respondents of
        that age in percent, rounded to three decimals. An empty Series is
        returned when no row has a positive age (previously this raised
        ``ValueError`` from ``min()`` on an empty list).
    """
    # NaN compares False against 0, so missing ages are dropped here together
    # with zero and negative ones.
    ages = [int(a) for a in df['age'].to_list() if a > 0]
    if not ages:
        return pd.Series(dtype=float)

    counts = Counter(ages)  # missing ages read back as 0
    total = len(ages)
    return pd.Series({
        age: percent(counts[age], total, 3)
        for age in range(min(ages), max(ages) + 1)
    })
if __name__ == '__main__':
    # Command-line switches. Both take an optional value: given bare they
    # evaluate to True, when absent they stay False.
    parser = ArgumentParser()
    parser.add_argument('-s', '--show', dest='show', default=False, nargs='?', const=True)
    parser.add_argument('-e', '--echo', dest='echo', default=False, nargs='?', const=True)
    args = parser.parse_args()

    if args.show:
        # Module-level flag read by generateBar.
        openFigs = True

    # Load the export and keep only respondents whose answer in column '0_'
    # is "osobą niebinarną" or "nie wiem". The copy makes the filtered frame
    # independent before a column is added to it.
    df = pd.read_csv(inputDir / 'export.csv')
    df = df[df['0_'].isin(['osobą niebinarną', 'nie wiem'])].copy()

    # Column '3_' is presumably the year of birth (verify against the survey);
    # derived ages above 100 are blanked out.
    df.loc[:, 'age'] = year - df['3_']
    df.loc[df['age'] > 100, 'age'] = None

    # Statistics for the whole sample and for sub-groups selected by the
    # answers in columns '4_' and '1_'.
    stats = {
        'general': analyse('general', df, args.echo),
        'location_poland': analyse('location_poland', df[df['4_'] == 'w Polsce'], args.echo),
        'location_abroad': analyse('location_abroad', df[df['4_'] == 'za granicą'], args.echo),
        'agab_f': analyse('agab_f', df[df['1_'] == 'żeńską'], args.echo),
        'agab_m': analyse('agab_m', df[df['1_'] == 'męską'], args.echo),
        # 'agab_x': analyse('agab_x', df[df['1_'] == 'inną (w jurysdykcjach, gdzie to możliwe)'], args.echo),
    }

    # Comparison charts: output directory -> {group key: legend label}.
    comparisons = {
        'by_location': {
            'general': 'Ogół',
            'location_poland': 'Polska',
            'location_abroad': 'Zagranica',
        },
        'by_agab': {
            'general': 'Ogół',
            'agab_f': 'AFAB',
            'agab_m': 'AMAB',
        },
    }

    # Stats key -> chart title. Every key listed here is plotted for every
    # group and for every comparison.
    graphs = {
        'age': 'Wiek osób respondenckich',
        'pronounGroups': 'Rodzaj gramatyczny używany w mowie',
        'pronouns': 'Zaimki używane w piśmie',
        'pronounsAggr': 'Zaimki używane w mowie i piśmie (zgrupowane)',
        'nouns': 'Rzeczowniki',
        'honorifics': 'Formy grzecznościowe',
        'obstacles': 'Dlaczego nie formy niebinarne?',
        'reasons': 'Co wpływa na wybór form?',
        'groups': 'Formy do opisu grup mieszanych',
        'english': 'Zaimki w języku angielskim',
        'labels': 'Etykietki',
    }

    # One chart per (group, graph).
    for group, group_stats in stats.items():
        for graph, graph_label in graphs.items():
            generateBar(group_stats[graph], group, graph, graph_label)

    # One multi-series chart per (comparison, graph); each compared group
    # becomes a column named after its legend label.
    for comparison_key, comparison_groups in comparisons.items():
        ensureEmptyDir(outputDir / comparison_key)
        for graph, graph_label in graphs.items():
            data = pd.DataFrame({
                groupLabel: stats[group][graph]
                for group, groupLabel
                in comparison_groups.items()
            })
            generateBar(data, comparison_key, graph, graph_label)