# ![polars](media/polars_github_logo_rect_dark_name.svg)

## **Background**

- #### DataFrame library
- #### Written in Rust
- #### No external dependencies required
- #### Apache Arrow data format
    - https://editor.analyticsvidhya.com/uploads/57471columnar.jpg

## **Use Cases**

- #### Data Cleaning, Transformation and Preparation
- #### Data Analysis
- #### Data Visualization
- #### Time Series Analysis
- #### Machine Learning
- #### Data Import and Export

## **Concepts**

In [102]:
import polars as pl
import numpy as np

### Data Types

- #### Numeric
    - Signed and unsigned integers (8, 16, 32, 64 bit)
    - Float (32, 64 bit)
- #### Nested
    - Struct
    - List
- #### Temporal
    - Date
    - Datetime
    - Duration
    - Time
- #### Other
    - Boolean
    - String
    - Binary
    - Object
    - Categorical
    - Enum

### Data structures

#### *Series*

In [103]:
# Build a named Series from a Python list; the integers are stored as i64.
s = pl.Series("a", [1, 2, 3, 4, 5])
s

a
i64
1
2
3
4
5


In [104]:
# Aggregations are available directly on a Series.
s.mean()

3.0

#### *DataFrame*

In [105]:
# Sample frame reused by the following sections: an integer column and a
# string column that each contain one null, a float column of random values,
# and a grouping key.
# NOTE(review): no seed is set in the visible cells, so the "random" column
# (and every output derived from it) will differ on a fresh run.
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.597686,"""A"""
2.0,"""ham""",0.070548,"""A"""
3.0,"""spam""",0.308382,"""B"""
,"""egg""",0.857725,"""C"""
5.0,,0.467545,"""B"""


In [106]:
# Preview the first two rows.
df.head(2)

nrs,names,random,groups
i64,str,f64,str
1,"""foo""",0.597686,"""A"""
2,"""ham""",0.070548,"""A"""


In [107]:
# Preview the last two rows.
df.tail(2)

nrs,names,random,groups
i64,str,f64,str
,"""egg""",0.857725,"""C"""
5.0,,0.467545,"""B"""


In [108]:
# Per-column summary: count, null_count, mean, std, min, quartiles and max.
df.describe()

describe,nrs,names,random,groups
str,f64,str,f64,str
"""count""",4.0,"""4""",5.0,"""5"""
"""null_count""",1.0,"""1""",0.0,"""0"""
"""mean""",2.75,,0.460377,
"""std""",1.707825,,0.296589,
"""min""",1.0,"""egg""",0.070548,"""A"""
"""25%""",2.0,,0.308382,
"""50%""",3.0,,0.467545,
"""75%""",3.0,,0.597686,
"""max""",5.0,"""spam""",0.857725,"""C"""


### Contexts

#### Selection

In [109]:
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.597686,"""A"""
2.0,"""ham""",0.070548,"""A"""
3.0,"""spam""",0.308382,"""B"""
,"""egg""",0.857725,"""C"""
5.0,,0.467545,"""B"""


In [110]:
# select() returns only the evaluated expressions. The sum skips the null
# entry in "nrs" (1 + 2 + 3 + 5 = 11).
q1 = df.select(
    pl.sum("nrs")
)
q1

nrs
i64
11


In [111]:
# Several expressions can be evaluated in one select(); each one produces
# its own output column.
q2 = df.select(
    pl.sum("nrs"),
    pl.count("groups"),
)
q2

nrs,groups
i64,u32
11,5


In [113]:
# A row-wise expression next to an aggregation: the single count value is
# broadcast to every row of the result.
q4 = df.select(
    pl.col("random") * 3.14,
    pl.count("groups"),
)
q4

random,groups
f64,u32
1.876734,5
0.22152,5
0.968319,5
2.693255,5
1.468092,5


In [114]:
# with_columns() keeps the existing columns and appends the new one; the
# scalar maximum is repeated on every row under the name "max".
q5 = df.with_columns(
    pl.col("random").max().alias("max"),
)
q5

nrs,names,random,groups,max
i64,str,f64,str,f64
1.0,"""foo""",0.597686,"""A""",0.857725
2.0,"""ham""",0.070548,"""A""",0.857725
3.0,"""spam""",0.308382,"""B""",0.857725
,"""egg""",0.857725,"""C""",0.857725
5.0,,0.467545,"""B""",0.857725


#### Filtering

In [115]:
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.597686,"""A"""
2.0,"""ham""",0.070548,"""A"""
3.0,"""spam""",0.308382,"""B"""
,"""egg""",0.857725,"""C"""
5.0,,0.467545,"""B"""


In [116]:
# Keep only the rows for which the predicate evaluates to true.
f1 = df.filter(
    pl.col("names") == "foo"
)
f1

nrs,names,random,groups
i64,str,f64,str
1,"""foo""",0.597686,"""A"""


In [117]:
# Numeric predicate: keep rows whose "random" value exceeds 0.5. Stored under
# its own name so the result of the previous filter example is not overwritten.
f2 = df.filter(
    pl.col("random") > 0.5
)
f2

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.597686,"""A"""
,"""egg""",0.857725,"""C"""


#### Group by / aggregation

In [118]:
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.597686,"""A"""
2.0,"""ham""",0.070548,"""A"""
3.0,"""spam""",0.308382,"""B"""
,"""egg""",0.857725,"""C"""
5.0,,0.467545,"""B"""


In [119]:
# One output row per distinct value of "groups"; every expression in agg()
# is evaluated within each group. Group "C" only holds the null "nrs" value,
# so its sum comes out as 0.
g1 = df.group_by("groups").agg(
    pl.sum("nrs").alias("sum nrs"),
    pl.col("random").count().alias("count random"),
)
g1

groups,sum nrs,count random
str,i64,u32
"""A""",3,2
"""B""",8,2
"""C""",0,1


### Expressions

In [None]:
df

#### Operators

In [120]:
# Arithmetic operators combine expressions with literals or with each other.
# A null operand yields a null result (fourth row).
df_num = df.select(
    (pl.col("nrs") + 5).alias("nrs + 5"),
    (pl.col("nrs") * pl.col("random")).alias("nrs * random"),
)
df_num

nrs + 5,nrs * random
i64,f64
6.0,0.597686
7.0,0.141096
8.0,0.925145
,
10.0,2.337726


In [121]:
# Comparison and boolean operators. Each comparison is parenthesised because
# Python's & and | bind tighter than <= and >.
# Nulls propagate through comparisons; in the fourth row False & null gives
# False while False | null stays null.
df_log = df.select(
    (pl.col("nrs") > 1).alias("nrs > 1"),
    (pl.col("nrs") == 1).alias("nrs == 1"),
    ((pl.col("random") <= 0.5) & (pl.col("nrs") > 1)).alias("and_expr"),
    ((pl.col("random") <= 0.5) | (pl.col("nrs") > 1)).alias("or_expr"), 
)
df_log

nrs > 1,nrs == 1,and_expr,or_expr
bool,bool,bool,bool
False,True,False,False
True,False,True,True
True,False,True,True
,,False,
True,False,True,True


#### Column selection

In [122]:
# "*" is a wildcard that selects every column.
col_sel1 = df.select(pl.col("*"))
col_sel1

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.597686,"""A"""
2.0,"""ham""",0.070548,"""A"""
3.0,"""spam""",0.308382,"""B"""
,"""egg""",0.857725,"""C"""
5.0,,0.467545,"""B"""


In [123]:
# A name that starts with ^ and ends with $ is treated as a regular
# expression. The alternation is grouped so both anchors apply to each
# alternative: without the parentheses the pattern reads "starts with random"
# OR "ends with names" and would also pick up columns such as "random_id".
col_sel2 = df.select(pl.col("^(random|names)$"))
col_sel2

names,random
str,f64
"""foo""",0.597686
"""ham""",0.070548
"""spam""",0.308382
"""egg""",0.857725
,0.467545


In [124]:
# Passing a data type to pl.col() selects every column of that type; here
# the two string columns.
col_sel3 = df.select(pl.col(pl.String))
col_sel3

names,groups
str,str
"""foo""","""A"""
"""ham""","""A"""
"""spam""","""B"""
"""egg""","""C"""
,"""B"""


#### String manipulation

In [125]:
# Small text frame for the string examples. "ptyhon" is misspelled on
# purpose: a later cell corrects it with str.replace.
string_df = pl.DataFrame(
    {
        "text": ["ptyhon", "polars is a dataframe library", "pandas"]
    }
)

In [126]:
# str.contains() with an alternation pattern: true when either word occurs.
# The misspelled "ptyhon" does not match.
s1 = string_df.select(
    pl.col("text"),
    pl.col("text").str.contains("python|polars").alias("contains"),
)
s1

text,contains
str,bool
"""ptyhon""",False
"""polars is a da…",True
"""pandas""",False


In [127]:
# str.extract() returns the text captured by the group, or null for rows
# where the pattern does not match.
s2 = string_df.select(
    pl.col("text"),
    pl.col("text").str.extract("(pandas)").alias("extract"),
)
s2

text,extract
str,str
"""ptyhon""",
"""polars is a da…",
"""pandas""","""pandas"""


In [128]:
# str.replace() fixes the misspelling; rows without a match are unchanged.
s3 = string_df.select(
    pl.col("text"),
    pl.col("text").str.replace("ptyhon", "python").alias("replace"),
)
s3

text,replace
str,str
"""ptyhon""","""python"""
"""polars is a da…","""polars is a da…"
"""pandas""","""pandas"""


#### Missing Data

In [129]:
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.597686,"""A"""
2.0,"""ham""",0.070548,"""A"""
3.0,"""spam""",0.308382,"""B"""
,"""egg""",0.857725,"""C"""
5.0,,0.467545,"""B"""


In [130]:
# Number of nulls per column, returned as a one-row frame.
df_null = df.null_count()
df_null

nrs,names,random,groups
u32,u32,u32,u32
1,1,0,0


In [131]:
# Replace the nulls in "nrs" with the literal value 12.
df_fill_lit = df.with_columns(
    pl.col("nrs").fill_null(pl.lit(12)),
)
df_fill_lit

nrs,names,random,groups
i64,str,f64,str
1,"""foo""",0.597686,"""A"""
2,"""ham""",0.070548,"""A"""
3,"""spam""",0.308382,"""B"""
12,"""egg""",0.857725,"""C"""
5,,0.467545,"""B"""


In [132]:
# Forward fill: a null takes the last preceding non-null value, so the
# final "names" entry becomes "egg".
df_fill_strat = df.with_columns(
    pl.col("names").fill_null(strategy="forward"),
)
df_fill_strat

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.597686,"""A"""
2.0,"""ham""",0.070548,"""A"""
3.0,"""spam""",0.308382,"""B"""
,"""egg""",0.857725,"""C"""
5.0,"""egg""",0.467545,"""B"""


In [133]:
# Fill the gap in "nrs" by interpolating between its neighbours (3 and 5
# give 4.0). The resulting column is f64.
df_interpol = df.with_columns(
    pl.col("nrs").interpolate(),
)
df_interpol

nrs,names,random,groups
f64,str,f64,str
1.0,"""foo""",0.597686,"""A"""
2.0,"""ham""",0.070548,"""A"""
3.0,"""spam""",0.308382,"""B"""
4.0,"""egg""",0.857725,"""C"""
5.0,,0.467545,"""B"""


#### List

In [134]:
# One list of readings per sensor. The lists differ in length and include
# an implausible 999 and a missing value, both handled in the cells below.
data = {
    'sensor': ["Sensor 1", "Sensor 2", "Sensor 3"],
    'temperature': [[21, 22, 999], [-1, None], [23, 23, 24, 22, 21]]
}

df_list = pl.DataFrame(data)
df_list

sensor,temperature
str,list[i64]
"""Sensor 1""","[21, 22, 999]"
"""Sensor 2""","[-1, null]"
"""Sensor 3""","[23, 23, … 21]"


In [135]:
# Length of each list; the null reading of "Sensor 2" is counted as well.
l1 = df_list.select(
    pl.col("sensor"),
    pl.col("temperature").list.len().alias("observations")
)
l1

sensor,observations
str,u32
"""Sensor 1""",3
"""Sensor 2""",2
"""Sensor 3""",5


In [136]:
# Mean per list. Nulls are ignored, but the 999 reading inflates the
# average for "Sensor 1".
l2 = df_list.select(
    pl.col("sensor"),
    pl.col("temperature").list.mean().alias("avg. temp")
)
l2

sensor,avg. temp
str,f64
"""Sensor 1""",347.333333
"""Sensor 2""",-1.0
"""Sensor 3""",22.6


In [137]:
# Clean every list element-wise: list.eval() runs the inner expression on
# the elements of each list. filter() keeps only elements whose predicate is
# true, so the 999 outlier and the null reading (whose comparison result is
# null) are both discarded. No drop_nulls() is chained after list.eval():
# there it would act on the outer list column, dropping whole rows rather
# than null elements.
l3 = df_list.select(
    pl.col("sensor"),
    pl.col("temperature").list.eval(
        pl.element().filter(pl.element() < 30)
    ),
)
l3

sensor,temperature
str,list[i64]
"""Sensor 1""","[21, 22]"
"""Sensor 2""",[-1]
"""Sensor 3""","[23, 23, … 21]"


In [138]:
# Mean per list again, now computed on the cleaned readings.
l4 = l3.select(
    pl.col("sensor"),
    pl.col("temperature").list.mean().alias("avg. temp")
)
l4

sensor,avg. temp
str,f64
"""Sensor 1""",21.5
"""Sensor 2""",-1.0
"""Sensor 3""",22.6


### **IO**

- #### CSV, Excel, JSON
- #### Parquet files
- #### Databases

## **Example**

In [139]:
# Load the Netflix titles dataset; the path is relative to the working
# directory of the notebook.
df_raw = pl.read_csv("data/netflix.csv")

In [140]:
# First rows of the raw data.
df_raw.head()

show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
str,str,str,str,str,str,i64,str,str,str
"""s1""","""Movie""","""Dick Johnson I…","""Kirsten Johnso…","""United States""","""9/25/2021""",2020,"""PG-13""","""90 min""","""Documentaries"""
"""s3""","""TV Show""","""Ganglands""","""Julien Leclerc…","""France""","""9/24/2021""",2021,"""TV-MA""","""1 Season""","""Crime TV Shows…"
"""s6""","""TV Show""","""Midnight Mass""","""Mike Flanagan""","""United States""","""9/24/2021""",2021,"""TV-MA""","""1 Season""","""TV Dramas, TV …"
"""s14""","""Movie""","""Confessions of…","""Bruno Garotti""","""Brazil""","""9/22/2021""",2021,"""TV-PG""","""91 min""","""Children & Fam…"
"""s8""","""Movie""","""Sankofa""","""Haile Gerima""","""United States""","""9/24/2021""",1993,"""TV-MA""","""125 min""","""Dramas, Indepe…"


In [141]:
# Last rows of the raw data.
df_raw.tail()

show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
str,str,str,str,str,str,i64,str,str,str
"""s8797""","""TV Show""","""Yunus Emre""","""Not Given""","""Turkey""","""1/17/2017""",2016,"""TV-PG""","""2 Seasons""","""International …"
"""s8798""","""TV Show""","""Zak Storm""","""Not Given""","""United States""","""9/13/2018""",2016,"""TV-Y7""","""3 Seasons""","""Kids' TV"""
"""s8801""","""TV Show""","""Zindagi Gulzar…","""Not Given""","""Pakistan""","""12/15/2016""",2012,"""TV-PG""","""1 Season""","""International …"
"""s8784""","""TV Show""","""Yoko""","""Not Given""","""Pakistan""","""6/23/2018""",2016,"""TV-Y""","""1 Season""","""Kids' TV"""
"""s8786""","""TV Show""","""YOM""","""Not Given""","""Pakistan""","""6/7/2018""",2016,"""TV-Y7""","""1 Season""","""Kids' TV"""


In [142]:
# Summary statistics; only release_year is numeric, so the string columns
# report just count, null_count, min and max.
df_raw.describe()

describe,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
str,str,str,str,str,str,str,f64,str,str,str
"""count""","""8790""","""8790""","""8790""","""8790""","""8790""","""8790""",8790.0,"""8790""","""8790""","""8790"""
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,"""0""","""0""","""0"""
"""mean""",,,,,,,2014.183163,,,
"""std""",,,,,,,8.825466,,,
"""min""","""s1""","""Movie""","""#Alive""","""A. L. Vijay""","""Argentina""","""1/1/2008""",1925.0,"""G""","""1 Season""","""Action & Adven…"
"""25%""",,,,,,,2013.0,,,
"""50%""",,,,,,,2017.0,,,
"""75%""",,,,,,,2019.0,,,
"""max""","""s999""","""TV Show""","""최강전사 미니특공대 : 영…","""Şenol Sönmez""","""Zimbabwe""","""9/9/2021""",2021.0,"""UR""","""99 min""","""Thrillers"""


In [143]:
# Transposed preview: one line per column with its dtype and first values.
df_raw.glimpse()

Rows: 8790
Columns: 10
$ show_id      <str> 's1', 's3', 's6', 's14', 's8', 's9', 's10', 's939', 's13', 's940'
$ type         <str> 'Movie', 'TV Show', 'TV Show', 'Movie', 'Movie', 'TV Show', 'Movie', 'Movie', 'Movie', 'Movie'
$ title        <str> 'Dick Johnson Is Dead', 'Ganglands', 'Midnight Mass', 'Confessions of an Invisible Girl', 'Sankofa', 'The Great British Baking Show', 'The Starling', 'Motu Patlu in the Game of Zones', 'Je Suis Karl', 'Motu Patlu in Wonderland'
$ director     <str> 'Kirsten Johnson', 'Julien Leclercq', 'Mike Flanagan', 'Bruno Garotti', 'Haile Gerima', 'Andy Devonshire', 'Theodore Melfi', 'Suhas Kadav', 'Christian Schwochow', 'Suhas Kadav'
$ country      <str> 'United States', 'France', 'United States', 'Brazil', 'United States', 'United Kingdom', 'United States', 'India', 'Germany', 'India'
$ date_added   <str> '9/25/2021', '9/24/2021', '9/24/2021', '9/22/2021', '9/24/2021', '9/24/2021', '9/24/2021', '5/1/2021', '9/23/2021', '5/1/2021'
$ release_year <i64> 202

In [144]:
# Parse the month/day/year strings in "date_added" into a datetime column.
# The expression keeps the column name, so the string column is replaced.
df = df_raw.with_columns(
    pl.col("date_added").str.to_datetime(format="%m/%d/%Y")
)
df

show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
str,str,str,str,str,datetime[μs],i64,str,str,str
"""s1""","""Movie""","""Dick Johnson I…","""Kirsten Johnso…","""United States""",2021-09-25 00:00:00,2020,"""PG-13""","""90 min""","""Documentaries"""
"""s3""","""TV Show""","""Ganglands""","""Julien Leclerc…","""France""",2021-09-24 00:00:00,2021,"""TV-MA""","""1 Season""","""Crime TV Shows…"
"""s6""","""TV Show""","""Midnight Mass""","""Mike Flanagan""","""United States""",2021-09-24 00:00:00,2021,"""TV-MA""","""1 Season""","""TV Dramas, TV …"
"""s14""","""Movie""","""Confessions of…","""Bruno Garotti""","""Brazil""",2021-09-22 00:00:00,2021,"""TV-PG""","""91 min""","""Children & Fam…"
"""s8""","""Movie""","""Sankofa""","""Haile Gerima""","""United States""",2021-09-24 00:00:00,1993,"""TV-MA""","""125 min""","""Dramas, Indepe…"
"""s9""","""TV Show""","""The Great Brit…","""Andy Devonshir…","""United Kingdom…",2021-09-24 00:00:00,2021,"""TV-14""","""9 Seasons""","""British TV Sho…"
"""s10""","""Movie""","""The Starling""","""Theodore Melfi…","""United States""",2021-09-24 00:00:00,2021,"""PG-13""","""104 min""","""Comedies, Dram…"
"""s939""","""Movie""","""Motu Patlu in …","""Suhas Kadav""","""India""",2021-05-01 00:00:00,2019,"""TV-Y7""","""87 min""","""Children & Fam…"
"""s13""","""Movie""","""Je Suis Karl""","""Christian Schw…","""Germany""",2021-09-23 00:00:00,2021,"""TV-MA""","""127 min""","""Dramas, Intern…"
"""s940""","""Movie""","""Motu Patlu in …","""Suhas Kadav""","""India""",2021-05-01 00:00:00,2013,"""TV-Y7""","""76 min""","""Children & Fam…"


In [145]:
# Tighten the schema: "type" becomes an Enum with a fixed set of categories,
# the repetitive string columns become Categorical, and the release year is
# stored as an unsigned 16-bit integer.
df = df.cast(
    {
        "type": pl.Enum(["TV Show", "Movie"]),
        "director": pl.Categorical,
        "rating": pl.Categorical,
        "country": pl.Categorical,
        "listed_in": pl.Categorical,
        "release_year": pl.UInt16,
    }
)

In [146]:
# Confirm the column types after the cast.
df.dtypes

[String,
 Enum(categories=['TV Show', 'Movie']),
 String,
 Categorical(ordering='physical'),
 Categorical(ordering='physical'),
 Datetime(time_unit='us', time_zone=None),
 UInt16,
 Categorical(ordering='physical'),
 String,
 Categorical(ordering='physical')]

In [147]:
# Count titles per director, most prolific first. The placeholder value
# "Not Given" is counted like any other director and tops the list.
top_directors = df.group_by("director").agg(
    pl.count("show_id").alias("# of titles"),
).sort("# of titles", descending=True)

top_directors.head(10)

director,# of titles
cat,u32
"""Not Given""",2588
"""Rajiv Chilaka""",20
"""Raúl Campos, J…",18
"""Alastair Fothe…",18
"""Suhas Kadav""",16
"""Marcus Raboy""",16
"""Jay Karas""",14
"""Cathy Garcia-M…",13
"""Youssef Chahin…",12
"""Martin Scorses…",12


In [150]:
# Rows whose title occurs more than once. The predicate is written as an
# expression (pl.col), like the other filters in this notebook, instead of
# indexing the frame's Series directly.
# NOTE(review): the execution counter shows this cell was run after the
# de-duplication cell that follows it, which would explain the empty result
# recorded here; re-run the cells in order to see the duplicates, if any.
non_unique_rows = df.filter(pl.col("title").is_duplicated())
non_unique_rows

show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
str,enum,str,cat,cat,datetime[μs],u16,cat,str,cat


In [149]:
# Keep a single row per title. The row order of the result differs from the
# order in the CSV, as the recorded output shows.
df = df.unique(subset=["title"],keep="first")
df

show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
str,enum,str,cat,cat,datetime[μs],u16,cat,str,cat
"""s3863""","""Movie""","""Despite Everyt…","""Gabriela Tagli…","""Spain""",2019-05-03 00:00:00,2019,"""TV-MA""","""79 min""","""Comedies, Inte…"
"""s7731""","""Movie""","""Personal Shopp…","""Olivier Assaya…","""France""",2019-02-01 00:00:00,2016,"""R""","""105 min""","""Dramas, Intern…"
"""s1342""","""Movie""","""Malcolm & Mari…","""Sam Levinson""","""United States""",2021-02-05 00:00:00,2021,"""R""","""106 min""","""Dramas, Indepe…"
"""s4815""","""Movie""","""W. Kamau Bell:…","""Shannon Hartma…","""United States""",2018-06-26 00:00:00,2018,"""TV-MA""","""66 min""","""Stand-Up Comed…"
"""s1201""","""TV Show""","""Waffles + Moch…","""Not Given""","""United States""",2021-03-16 00:00:00,2021,"""TV-Y""","""1 Season""","""Kids' TV"""
"""s3363""","""Movie""","""Arsenio Hall: …","""Brian Volk-Wei…","""United States""",2019-10-29 00:00:00,2019,"""TV-MA""","""63 min""","""Stand-Up Comed…"
"""s7970""","""Movie""","""Secret in Thei…","""Billy Ray""","""United States""",2018-04-01 00:00:00,2015,"""PG-13""","""111 min""","""Dramas, Thrill…"
"""s8485""","""Movie""","""The Rift: The …","""Dejan Zečević""","""Serbia""",2018-02-26 00:00:00,2016,"""TV-MA""","""92 min""","""Horror Movies,…"
"""s6959""","""Movie""","""Her""","""Spike Jonze""","""United States""",2018-07-29 00:00:00,2013,"""R""","""126 min""","""Dramas, Romant…"
"""s1712""","""TV Show""","""Graceful Frien…","""Not Given""","""South Korea""",2020-11-12 00:00:00,2020,"""TV-MA""","""1 Season""","""Crime TV Shows…"


In [151]:
# Number of rows left after de-duplication; used as the denominator below.
total = df.height

In [152]:
# Share of each type as a percentage of all titles. The result column keeps
# the name "# of titles" even though it holds percentages.
movies_vs_series = (
    df.group_by("type")
    .agg((pl.count("show_id") / total * 100).alias("# of titles"))
    .sort("# of titles", descending=True)
)

movies_vs_series.head(10)

type,# of titles
enum,f64
"""Movie""",69.693866
"""TV Show""",30.306134


## **References**

- https://pola-rs.github.io/polars/
- https://realpython.com/polars-python/