Examples

API Usage Examples

Backend Setup

"""
Shows the two ways of initializing a Parfun backend.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.api_usage.backend_setup
"""

import parfun as pf


if __name__ == "__main__":
    # Set the parallel backend process-wide.
    pf.set_parallel_backend("local_multiprocessing")

    # Set the parallel backend within a Python context manager.
    with pf.set_parallel_backend_context("scaler_remote", scheduler_address="tcp://scaler.cluster:1243"):
        ...  # Will run the parallel tasks on a remotely set up Scaler cluster.
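
Once a backend is set, any function decorated with `pf.parallel` is called like a regular Python function. Below is a minimal, illustrative sketch: the `double_values` function is not part of the Parfun API, and it only reuses the partitioning helpers shown elsewhere on this page.

from typing import List

import parfun as pf


@pf.parallel(
    split=pf.all_arguments(pf.py_list.by_chunk),
    combine_with=pf.py_list.concat,
)
def double_values(values: List[int]) -> List[int]:
    # Illustrative only: doubles every element of a list chunk.
    return [2 * value for value in values]


if __name__ == "__main__":
    with pf.set_parallel_backend_context("local_multiprocessing"):
        print(double_values(list(range(10))))  # [0, 2, 4, ..., 18]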

Partitioning API

all_arguments

"""
Uses `all_arguments` to partition all the input data of a parallel function.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.api_usage.all_arguments
"""

import pandas as pd

import parfun as pf


@pf.parallel(
    split=pf.all_arguments(pf.dataframe.by_group(by=["year", "month"])),
    combine_with=pf.dataframe.concat,
)
def monthly_sum(sales: pd.DataFrame, costs: pd.DataFrame) -> pd.DataFrame:
    merged = pd.merge(sales, costs, on=["year", "month", "day"], how="outer")
    # Group and sum by day
    grouped = merged.groupby(["year", "month", "day"], as_index=False).sum(numeric_only=True)

    return grouped


if __name__ == "__main__":
    sales = pd.DataFrame({
        "year": [2024, 2024, 2024],
        "month": [1, 1, 2],
        "day": [1, 2, 1],
        "sales": [100, 200, 150]
    })

    costs = pd.DataFrame({
        "year": [2024, 2024, 2024],
        "month": [1, 1, 2],
        "day": [1, 2, 1],
        "costs": [50, 70, 80]
    })

    with pf.set_parallel_backend_context("local_multiprocessing"):
        result = monthly_sum(sales, costs)

    print(result)
    #     year  month  day  sales  costs
    # 0  2024      1    1    100     50
    # 1  2024      1    2    200     70
    # 2  2024      2    1    150     80

per_argument

"""
Uses `per_argument` to partition the input data from multiple arguments.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.api_usage.per_argument
"""

from typing import List

import pandas as pd

import parfun as pf


@pf.parallel(
    split=pf.per_argument(
        factors=pf.py_list.by_chunk,
        dataframe=pf.dataframe.by_row,
    ),
    combine_with=pf.dataframe.concat,
)
def multiply_by_row(factors: List[int], dataframe: pd.DataFrame) -> pd.DataFrame:
    assert len(factors) == len(dataframe)
    return dataframe.multiply(factors, axis=0)


if __name__ == "__main__":
    dataframe = pd.DataFrame({
        "A": [1, 2, 3],
        "B": [4, 5, 6]
    })

    factors = [10, 20, 30]

    with pf.set_parallel_backend_context("local_multiprocessing"):
        result = multiply_by_row(factors, dataframe)

    print(result)
    #     A    B
    # 0  10   40
    # 1  40  100
    # 2  90  180

Custom Partition Function

"""
Shows how to use custom Python generators and functions as partitioning and combining functions.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.api_usage.custom_generators
"""

from typing import Generator, Iterable, Tuple

import pandas as pd

import parfun as pf


def partition_by_day_of_week(dataframe: pd.DataFrame) -> Generator[Tuple[pd.DataFrame], None, None]:
    """Divides the computation on the "datetime" value by day of the week (Monday, Tuesday, ...)."""

    for _, partition in dataframe.groupby(dataframe["datetime"].dt.day_of_week):
        yield partition,  # Should always yield a tuple that matches the input parameters.


def combine_results(dataframes: Iterable[pd.DataFrame]) -> pd.DataFrame:
    """Collects the results by concatenating them, making sure the values stay sorted by date."""
    return pd.concat(dataframes).sort_values(by="datetime")


@pf.parallel(
    split=pf.all_arguments(partition_by_day_of_week),
    combine_with=combine_results,
)
def daily_mean(dataframe: pd.DataFrame) -> pd.DataFrame:
    return dataframe.groupby(dataframe["datetime"].dt.date).mean(numeric_only=True)


if __name__ == "__main__":
    dataframe = pd.DataFrame({
        # Probing times
        "datetime": pd.to_datetime([
            "2025-04-01 06:00", "2025-04-01 18:00", "2025-04-02 10:00", "2025-04-03 14:00", "2025-04-03 23:00",
            "2025-04-04 08:00", "2025-04-05 12:00", "2025-04-06 07:00", "2025-04-06 20:00", "2025-04-07 09:00",
            "2025-04-08 15:00", "2025-04-09 11:00", "2025-04-10 13:00", "2025-04-11 06:00", "2025-04-12 16:00",
            "2025-04-13 17:00", "2025-04-14 22:00", "2025-04-15 10:00", "2025-04-16 09:00", "2025-04-17 13:00",
            "2025-04-18 14:00", "2025-04-19 18:00", "2025-04-20 07:00", "2025-04-21 20:00", "2025-04-22 15:00",
        ]),
        # Temperature values (°C)
        "temperature": [
            7.2, 10.1, 9.8, 12.5, 11.7,
            8.9, 13.0, 7.5, 10.8, 9.3,
            12.1, 11.5, 13.3, 6.8, 12.7,
            13.5, 9.2, 10.0, 9.9, 11.8,
            12.4, 10.6, 7.9, 9.5, 11.6,
        ],
        # Humidity values (%)
        "humidity": [
            85, 78, 80, 75, 76,
            88, 73, 89, 77, 84,
            72, 74, 70, 90, 71,
            69, 86, 81, 83, 76,
            74, 79, 87, 82, 73,
        ]
    })

    with pf.set_parallel_backend_context("local_multiprocessing"):
        result = daily_mean(dataframe)

    print(result)
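
The same contract works for non-dataframe arguments: the partitioning generator yields one tuple of arguments per partition, and the combining function receives the per-partition results. Here is a minimal, illustrative sketch; `partition_by_parity` and `total` are only examples, not part of the Parfun API.

from typing import Generator, Iterable, List, Tuple

import parfun as pf


def partition_by_parity(values: List[int]) -> Generator[Tuple[List[int]], None, None]:
    # Yields one tuple per partition, matching the parallel function's parameters.
    yield [value for value in values if value % 2 == 0],
    yield [value for value in values if value % 2 != 0],


def combine_sums(sums: Iterable[int]) -> int:
    return sum(sums)


@pf.parallel(
    split=pf.all_arguments(partition_by_parity),
    combine_with=combine_sums,
)
def total(values: List[int]) -> int:
    return sum(values)


if __name__ == "__main__":
    with pf.set_parallel_backend_context("local_multiprocessing"):
        print(total(list(range(10))))  # 45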

Enforced Partition Size

"""
Shows how to control the partition size of a parallel function with `fixed_partition_size` and
`initial_partition_size`.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.api_usage.partition_size
"""

import numpy as np
import pandas as pd

import parfun as pf


# With `fixed_partition_size`, the input dataframe is always split into chunks of 1000 rows.
@pf.parallel(
    split=pf.all_arguments(pf.dataframe.by_row),
    combine_with=sum,
    fixed_partition_size=1000,
)
def fixed_partition_size_sum(dataframe: pd.DataFrame) -> float:
    return dataframe.values.sum()


# With `initial_partition_size`, the input dataframe is initially split into chunks of 1000 rows, until Parfun's
# machine-learning algorithm finds a better estimate.
@pf.parallel(
    split=pf.all_arguments(pf.dataframe.by_row),
    combine_with=sum,
    initial_partition_size=1000,
)
def initial_partition_size_sum(dataframe: pd.DataFrame) -> float:
    return dataframe.values.sum()


# Both `fixed_partition_size` and `initial_partition_size` accept a callable instead of an integer value. This
# allows partition sizes to be computed from the input parameters.
@pf.parallel(
    split=pf.all_arguments(pf.dataframe.by_row),
    combine_with=sum,
    initial_partition_size=lambda dataframe: max(10, len(dataframe) // 4),
)
def computed_partition_size_sum(dataframe: pd.DataFrame) -> float:
    return dataframe.values.sum()


if __name__ == "__main__":
    dataframe = pd.DataFrame(
        np.random.randint(0, 100, size=(100, 3)),
        columns=["alpha", "beta", "gamma"],
    )

    with pf.set_parallel_backend_context("local_multiprocessing"):
        print(fixed_partition_size_sum(dataframe))
        print(initial_partition_size_sum(dataframe))
        print(computed_partition_size_sum(dataframe))

Profiling

"""
Demonstrates the use of the `profile` and `trace_export` parameters for profiling Parfun's performance.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.api_usage.profiling
"""

import random
from typing import List

import parfun as pf


@pf.parallel(
    split=pf.all_arguments(pf.py_list.by_chunk),
    combine_with=sum,
    profile=True,
    trace_export="parallel_sum_trace.csv",
)
def parallel_sum(values: List) -> int:
    return sum(values)


if __name__ == "__main__":
    N_VALUES = 100_000
    values = [random.randint(0, 99) for _ in range(0, N_VALUES)]

    with pf.set_parallel_backend_context("local_multiprocessing"):
        print("Sum =", parallel_sum(values))
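
The exported trace is a plain CSV file that can be inspected with any tabular tool. Below is a minimal sketch, assuming the example above has been run from the current directory; the exact columns depend on the Parfun version, so it only prints whatever is there.

import pandas as pd

# Loads the trace exported by the example above and shows its first rows.
trace = pd.read_csv("parallel_sum_trace.csv")
print(trace.head())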

Nested Parfun Calls

"""
Shows how a Parfun function can be called from within another Parfun function.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.api_usage.nested_functions
"""

import pprint
import random
from typing import List

import parfun as pf


@pf.parallel(
    split=pf.all_arguments(pf.py_list.by_chunk),
    combine_with=pf.py_list.concat,
)
def add_vectors(vec_a: List, vec_b: List) -> List:
    """Add two vectors, element-wise."""
    return [a + b for a, b in zip(vec_a, vec_b)]


@pf.parallel(
    split=pf.all_arguments(pf.py_list.by_chunk),
    combine_with=pf.py_list.concat,
)
def add_matrices(mat_a: List[List], mat_b: List[List]) -> List[List]:
    """Add two matrices, row by row."""
    return [add_vectors(vec_a, vec_b) for vec_a, vec_b in zip(mat_a, mat_b)]


if __name__ == "__main__":
    N_ROWS, N_COLS = 10, 10

    mat_a = [[random.randint(0, 99) for _ in range(0, N_COLS)] for _ in range(0, N_ROWS)]
    mat_b = [[random.randint(0, 99) for _ in range(0, N_COLS)] for _ in range(0, N_ROWS)]

    print("A =")
    pprint.pprint(mat_a)

    print("B =")
    pprint.pprint(mat_b)

    with pf.set_parallel_backend_context("local_multiprocessing"):
        result = add_matrices(mat_a, mat_b)

    print("A + B =")
    pprint.pprint(result)

Application Examples

Count Bigrams in a Text in Parallel

"""
Counts the most common two-letter sequences (bigrams) in the content of a URL.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.count_bigrams.main
"""

import collections
import ssl

from typing import Counter, Iterable, List
from urllib.request import urlopen

import psutil

import parfun as pf


def sum_counters(counters: Iterable[Counter[str]]) -> Counter[str]:
    return sum(counters, start=collections.Counter())


@pf.parallel(
    split=pf.per_argument(
        lines=pf.py_list.by_chunk
    ),
    combine_with=sum_counters,
)
def count_bigrams(lines: List[str]) -> Counter:
    counter: Counter[str] = collections.Counter()

    for line in lines:
        for word in line.split():
            for first, second in zip(word, word[1:]):
                bigram = f"{first}{second}"
                counter[bigram] += 1

    return counter


if __name__ == "__main__":
    N_WORKERS = psutil.cpu_count(logical=False)
    URL = "https://www.gutenberg.org/ebooks/100.txt.utf-8"
    TOP_K = 10

    with urlopen(URL, context=ssl._create_unverified_context()) as response:
        content = response.read().decode("utf-8").splitlines()

    with pf.set_parallel_backend_context("local_multiprocessing", max_workers=N_WORKERS):
        counts = count_bigrams(content)

    print(f"Top {TOP_K} bigrams:")
    for bigram, count in counts.most_common(TOP_K):
        print(f"\t{bigram:<10}:\t{count}")
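
For reference, the same counts can be computed sequentially with a plain `collections.Counter`. The `count_bigrams_sequential` function below is only an illustrative sanity check for the parallel result, not part of the example module.

import collections
from typing import Counter, List


def count_bigrams_sequential(lines: List[str]) -> Counter[str]:
    # Same bigram logic as the parallel version above, without any partitioning.
    counter: Counter[str] = collections.Counter()
    for line in lines:
        for word in line.split():
            counter.update(word[i:i + 2] for i in range(len(word) - 1))
    return counter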

Parallel Training of a Random Forest on California Housing Data

"""
Trains an ensemble of decision tree regressors on the California housing dataset from scikit-learn.

Compares the training time with and without splitting the training dataset using Parfun.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.california_housing.main
"""

import timeit

from typing import List

import numpy as np
import pandas as pd
import psutil

from sklearn.datasets import fetch_california_housing
from sklearn.base import RegressorMixin
from sklearn.tree import DecisionTreeRegressor

import parfun as pf


class MeanRegressor(RegressorMixin):
    def __init__(self, regressors: List[RegressorMixin]) -> None:
        super().__init__()
        self._regressors = regressors

    def predict(self, X):
        # Averages the predictions of the underlying regressors, sample by sample.
        return np.mean([regressor.predict(X) for regressor in self._regressors], axis=0)


@pf.parallel(
    split=pf.per_argument(dataframe=pf.dataframe.by_row),
    combine_with=lambda regressors: MeanRegressor(list(regressors))
)
def train_regressor(dataframe: pd.DataFrame, feature_names: List[str], target_name: str) -> RegressorMixin:
    regressor = DecisionTreeRegressor()
    regressor.fit(dataframe[feature_names], dataframe[[target_name]])

    return regressor


if __name__ == "__main__":
    N_WORKERS = psutil.cpu_count(logical=False)

    dataset = fetch_california_housing(download_if_missing=True)

    feature_names = dataset["feature_names"]
    target_name = dataset["target_names"][0]

    dataframe = pd.DataFrame(dataset["data"], columns=feature_names)
    dataframe[target_name] = dataset["target"]

    N_MEASURES = 5

    with pf.set_parallel_backend_context("local_single_process"):
        regressor = train_regressor(dataframe, feature_names, target_name)

        duration = (
            timeit.timeit(lambda: train_regressor(dataframe, feature_names, target_name), number=N_MEASURES)
            / N_MEASURES
        )

        print("Sequential training duration:", duration)

    with pf.set_parallel_backend_context("local_multiprocessing", max_workers=N_WORKERS):
        regressor = train_regressor(dataframe, feature_names, target_name)

        duration = (
            timeit.timeit(lambda: train_regressor(dataframe, feature_names, target_name), number=N_MEASURES)
            / N_MEASURES
        )

        print("Parallel training duration:", duration)

Compute Electricity Production Statistics in Parallel

"""
Based on the monthly electricity production data from ENTSO-E, plots the monthly share of renewable and other energy
sources in European electricity production.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ pip install -r examples/requirements.txt
    $ python -m examples.europe_electricity.main [--plot]
"""

import sys
from typing import List

import pandas as pd

import parfun as pf


def fetch_production_data(year: int) -> pd.DataFrame:
    """
    Downloads the monthly production data for the given year.

    Sourced from https://www.entsoe.eu/data/power-stats/.
    """

    url = f"https://www.entsoe.eu/publications/data/power-stats/{year}/monthly_domestic_values_{year}.csv"

    result = pd.read_csv(url, sep=r"\t|,|;", engine="python")

    # Some newer datasets use "Area" instead of "Country".
    if "Area" in result.columns:
        result["Country"] = result["Area"]

    return result[["Year", "Month", "Category", "Country", "ProvidedValue"]]


def make_consumption_negative(production_data: pd.DataFrame) -> pd.DataFrame:
    """
    Turn consumption values into negative production values.

    Some production categories have positive consumption values (e.g. "Consumption of Hydro Water Reservoir"). This
    function transforms these values into their production counterparts, but with a negative value. This simplifies
    subsequent processing.
    """

    PREFIX = "Consumption of "

    result = production_data.copy()

    is_consumption = result["Category"].str.startswith(PREFIX)

    result.loc[is_consumption, "Category"] = result.loc[is_consumption, "Category"].str.replace(PREFIX, "", regex=False)
    result.loc[is_consumption, "ProvidedValue"] *= -1

    return result


def group_production_by_type(production_data: pd.DataFrame) -> pd.DataFrame:
    """Groups and sums all production data by type ("Fossil", "Nuclear", "Renewable" and "Other")."""

    fossil_sources = {
        "Fossil Gas", "Fossil Hard coal", "Fossil Oil",
        "Fossil Brown coal/Lignite", "Fossil Coal-derived gas",
        "Fossil Oil shale", "Fossil Peat",
    }
    nuclear_sources = {"Nuclear"}
    renewable_sources = {
        "Biomass", "Solar", "Wind Onshore", "Wind Offshore", "Geothermal",
        "Hydro Pumped Storage", "Hydro Run-of-river and poundage",
        "Hydro Water Reservoir", "Marine", "Other renewable",
    }

    def map_category(category: str) -> str:
        if category in fossil_sources:
            return "Fossil"
        elif category in nuclear_sources:
            return "Nuclear"
        elif category in renewable_sources:
            return "Renewable"
        else:
            return "Other"

    result = production_data.copy()

    result["EnergyType"] = result["Category"].map(map_category)
    del result["Category"]

    return result.groupby(["Year", "Month", "EnergyType"])["ProvidedValue"].sum().reset_index()


def monthly_percentage_production(production_data: pd.DataFrame) -> pd.DataFrame:
    """Returns the monthly production percentage for every month and every energy source type."""

    result = production_data.pivot_table(index=["Year", "Month"], columns="EnergyType", values="ProvidedValue")

    result = result.div(result.sum(axis=1), axis=0) * 100  # make it percentages

    # Uses a datetime index for year-month.
    result.index = pd.to_datetime({
        "year": result.index.get_level_values(0),
        "month": result.index.get_level_values(1),
        "day": 1,
    })

    result = result.sort_index(ascending=True)  # sort by date

    return result


@pf.parallel(
    split=pf.all_arguments(pf.py_list.by_chunk),
    combine_with=pf.dataframe.concat,
    initial_partition_size=2,
)
def get_monthly_percentage_production(years: List[int]) -> pd.DataFrame:
    processed_yearly_data = []
    for year in years:
        yearly_production_data = fetch_production_data(year)

        yearly_production_data = make_consumption_negative(yearly_production_data)
        yearly_production_data = group_production_by_type(yearly_production_data)
        yearly_production_data = monthly_percentage_production(yearly_production_data)

        processed_yearly_data.append(yearly_production_data)

    return pd.concat(processed_yearly_data)


def plot_electricity_production(production_percentages: pd.DataFrame) -> None:
    import matplotlib.pyplot as plt

    colors = {
        "Fossil": "lightcoral",
        "Nuclear": "violet",
        "Renewable": "lightgreen",
        "Other": "lightsteelblue",
    }

    production_percentages.index = production_percentages.index.strftime("%b %Y")
    production_percentages.plot(kind="bar", stacked=True, figsize=(10, 6), width=1, color=colors)

    plt.title("Europe's monthly electricity production by source")
    plt.ylabel("Percentage (%)")
    plt.xlabel("Month")
    plt.legend(title="Energy source", loc="upper left")
    plt.grid(axis="y", linestyle="--")
    plt.ylim(0, 100)

    plt.tight_layout()
    plt.show()


def main():
    YEARS = list(range(2019, 2025))

    with pf.set_parallel_backend_context("local_multiprocessing"):
        processed_data = get_monthly_percentage_production(YEARS)

    if "--plot" in sys.argv[1:]:
        plot_electricity_production(processed_data)
    else:
        print(processed_data)


if __name__ == "__main__":
    main()

Compute Portfolio Metrics in Parallel

"""
Based on a portfolio of stocks, computes basic statistics.

Usage:

    $ git clone https://github.com/Citi/parfun && cd parfun
    $ python -m examples.portfolio_metrics.main
"""

from typing import List

import pandas as pd

import parfun as pf


@pf.parallel(
    split=pf.per_argument(portfolio=pf.dataframe.by_group(by="country")),
    combine_with=pf.dataframe.concat,
)
def relative_metrics(portfolio: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Computes relative metrics (difference to the mean, squared difference to the mean, standardized difference) for
    each of the requested columns, grouped by country.
    """

    output = portfolio.copy()  # do not modify the input dataframe.

    for country in output["country"].unique():
        for column in columns:
            values = output.loc[output["country"] == country, column]

            mean = values.mean()
            std = values.std()

            output.loc[output["country"] == country, f"{column}_diff_to_mean"] = values - mean
            output.loc[output["country"] == country, f"{column}_sq_diff_to_mean"] = (values - mean) ** 2
            output.loc[output["country"] == country, f"{column}_relative_to_mean"] = (values - mean) / std

    return output


if __name__ == "__main__":
    portfolio = pd.DataFrame({
        "company": ["Apple", "Citigroup", "ASML", "Volkswagen", "Tencent"],
        "industry": ["technology", "banking", "technology", "manufacturing", "manufacturing"],
        "country": ["US", "US", "NL", "DE", "CN"],
        "market_cap": [2828000000000, 80310000000, 236000000000, 55550000000, 345000000000],
        "revenue": [397000000000, 79840000000, 27180000000, 312000000000, 79000000000],
        "workforce": [161000, 240000, 39850, 650951, 104503]
    })

    with pf.set_parallel_backend_context("local_multiprocessing"):
        metrics = relative_metrics(portfolio, ["market_cap", "revenue"])

    print(metrics)
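
For comparison, an equivalent sequential computation can be written with a pandas group-by transform. The `relative_metrics_sequential` function below is only an illustrative sketch, not part of the example module.

from typing import List

import pandas as pd


def relative_metrics_sequential(portfolio: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    # Same metrics as the parallel version above, computed with vectorized group-by transforms.
    output = portfolio.copy()
    grouped = output.groupby("country")

    for column in columns:
        mean = grouped[column].transform("mean")
        std = grouped[column].transform("std")

        output[f"{column}_diff_to_mean"] = output[column] - mean
        output[f"{column}_sq_diff_to_mean"] = (output[column] - mean) ** 2
        output[f"{column}_relative_to_mean"] = (output[column] - mean) / std

    return output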