PySparkで使い方に困ったときのメモ
複数カラムで集計
from pyspark.sql import functions as F

# Aggregate over two grouping columns:
#  - count:            number of (non-null) column_3 values per group
#  - collect_column_4: deduplicated set of column_4 values per group
# NOTE(review): assumes `df` is an existing DataFrame with these columns.
a1 = df.groupby('column_1', 'column_2').agg(
    F.count(F.col('column_3')).alias('count'),
    F.collect_set('column_4').alias('collect_column_4'),
)
複数カラムでorderby
from pyspark.sql import functions as F

# Group by two columns and sort the result by both, descending.
# (Original snippet ended with a dangling "\" line continuation — removed.)
# NOTE(review): assumes `df` is an existing DataFrame with these columns.
a1 = (
    df.select('column_1', 'column_2', 'column_3')
    .groupby('column_1', 'column_2')
    .agg(F.collect_set('column_3').alias('collect_column_3'))
    # A list of columns with a single `ascending` flag sorts all of them
    # in the same direction (here: both descending).
    .orderBy(['column_1', 'column_2'], ascending=False)
)
正規表現で置換
from pyspark.sql import functions as F

# Mask an IPv4 address to its /16 prefix: keep the first two octets
# (e.g. "192.168.10.5" -> "192.168.").
# Fixes vs. the original pattern '^(\\d+?\.+\\d+?\.).+':
#  - use a raw string so every backslash reaches the regex engine intact
#    (the original mixed '\\d' with a bare '\.', an invalid string escape);
#  - '\.+' matched one-or-more dots and would accept malformed input like
#    "1..2."; a single '\.' matches exactly one dot per octet separator.
# regexp_replace uses Java regex, so the group backreference is '$1'.
a1 = df.withColumn(
    'ip_mask_16',
    F.regexp_replace(F.col('ip'), r'^(\d+\.\d+\.).+', '$1'),
)