# Polars
DataFrames for the new era.
import polars as pl
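The snippets below assume a small frame with Name, Age, Salary and Department columns; here is a minimal made-up sample so the examples are runnable (the null Salary feeds the fill/interpolate examples later):
df = pl.DataFrame({
    "Name": ["Ann", "Bob", "Cara", "Dan"],
    "Age": [25, 34, 41, 29],
    "Salary": [48_000, 60_000, None, 75_000],
    "Department": ["Engineering", "Sales", "Engineering", "Sales"],
})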
# filter
df.filter(pl.col("Age") >= 10, pl.col("Salary") > 52_000)
# create a new column; reassign the variable if you want to keep the result
df = df.with_columns((pl.col("Salary") / 12).round(2).alias("Monthly Earnings"))
# drop a column
df = df.drop("Monthly Earnings")
# sort by Salary in descending order
df.sort("Salary", descending=True)
# filter, then aggregate: the max Salary among the filtered rows
df.filter(pl.col("Age") > 30)["Salary"].max()
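An equivalent expression-based form keeps everything inside Polars and extracts the scalar with `item()` (a sketch):
df.filter(pl.col("Age") > 30).select(pl.col("Salary").max()).item()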
# group by Department and run aggregate functions over each group
df.group_by("Department").agg([
    pl.col("Salary").mean().alias("average salary"),
    pl.col("Age").min().alias("Youngest age")
])
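Recent Polars versions also let `agg` take keyword arguments that name the output columns directly; a sketch of the same aggregation in that style:
df.group_by("Department").agg(
    average_salary=pl.col("Salary").mean(),  # named output column instead of .alias()
    youngest_age=pl.col("Age").min(),
)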
# count null values in every column
df.null_count()
# show which rows in a column are null
df.select(pl.col("Salary").is_null())
# keep only rows where Salary is not null
df.filter(pl.col("Salary").is_not_null())
# fill null values with the literal value 50
df.with_columns(pl.col("Salary").fill_null(pl.lit(50)))
# fill forward: take the last valid value and repeat it until the next non-null value
df.with_columns(pl.col("Salary").fill_null(strategy="forward"))
# fill backward: take the next valid value and fill it in going backwards
df.with_columns(pl.col("Salary").fill_null(strategy="backward"))
# interpolate nulls linearly between the surrounding values
df.with_columns(pl.col("Salary").interpolate())
## Serializing
# write
df.write_csv("data.csv")
df.write_json("data.json")
# read back; column types can be specified on load (e.g. parse a date string into an actual Date type), see the sketch below
df2 = pl.read_csv("data.csv")
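If the CSV contains date-like string columns, typed reading can be sketched as follows; the "Hire Date" column is hypothetical, and `schema_overrides` was called `dtypes` in older Polars releases:
# let Polars try to parse date-like strings into Date/Datetime columns
df2 = pl.read_csv("data.csv", try_parse_dates=True)
# or force a specific type for a (hypothetical) column
df2 = pl.read_csv("data.csv", schema_overrides={"Hire Date": pl.Date})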
## Plotting
df["Age"].plot.bar()
df.plot.scatter(x="Age", y="Salary", by="Name")
## Conversion
* Use the `to_numpy()` method to convert a DataFrame or Series to a NumPy array.
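For example, both `DataFrame.to_numpy()` and `Series.to_numpy()` are available:
arr = df.to_numpy()           # 2D array, one row per DataFrame row
ages = df["Age"].to_numpy()   # 1D array from a single column (Series)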