diff --git a/Cargo.toml b/Cargo.toml index e392fb9..525c937 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "akimbo-ip" -version = "0.1.0" +version = "0.1.1" edition = "2021" [lib] diff --git a/README.md b/README.md index 0cce2cf..8d356de 100644 --- a/README.md +++ b/README.md @@ -20,14 +20,18 @@ Model ----- - IPv4 addresses are (fixed) length 4 bytestrings, but can be represented - by any 4-bye value, e.g., uint32 -- IPv6 addresses are (fixed) length 16 bytestrings + by any 4-byte value, e.g., uint32 or fixed-4-length list of uint8 +- IPv6 addresses are (fixed) length 16 bytestrings or fixed-16-length list + of uint8 - Networks are records with an IPv4 or IPv6 field (nominally "address") and - a uint8 field for the prefix length (nominally "prefix") + a uint8 field for the prefix length (nominally "prefix"). The field + names can be overridden. -We can convert between hostmasks, netmasks and prefix lengths. +We can convert between hostmasks, netmasks and prefix lengths. Some methods +require composite types like list-of-addresses; see the individual docstrings. -Some methods +As with the normal functioning of akimbo, you can indicate which parts of +a nested structure should be operated on with the `where=` kwarg of any method. 
Usage ----- diff --git a/pyproject.toml b/pyproject.toml index d224dee..62ce58e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,9 @@ build-backend = "maturin" [project] name = "akimbo-ip" requires-python = ">=3.9" +description = "IP-specific methods for akimbo" +readme = "README.md" +license = { file = "LICENSE" } classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", @@ -14,7 +17,6 @@ dependencies = [ "akimbo" ] - [tool.maturin] features = ["pyo3/extension-module"] python-source = "python" diff --git a/python/akimbo_ip/accessor.py b/python/akimbo_ip/accessor.py index 585f4e5..3a34ce1 100644 --- a/python/akimbo_ip/accessor.py +++ b/python/akimbo_ip/accessor.py @@ -12,20 +12,28 @@ def match_ip4(arr): - # non-regular is not passed, might not all have right size + """matches fixed-list[4, u8] and fixed-bytestring[4] and ANY 4-byte value (like uint32, assumed big-endian)""" return (arr.is_leaf and arr.dtype.itemsize == 4) or ( arr.is_regular and arr.size == 4 and arr.content.is_leaf and arr.content.dtype.itemsize == 1) def match_ip6(arr): + """matches fixed-list[16, u8] and fixed-bytestring[16]""" return arr.is_regular and arr.size == 16 and arr.content.is_leaf and arr.content.dtype.itemsize == 1 +def match_prefix(arr): + """A network prefix is always one byte""" + return arr.is_leaf and arr.dtype.itemsize == 1 + + def match_net4(arr, address="address", prefix="prefix"): + """Matches a record with IP4 field and prefix field (u8)""" return ( arr.is_record and {address, prefix}.issubset(arr.fields) and match_ip4(arr[address]) + and match_prefix(arr[prefix]) ) @@ -34,6 +42,7 @@ def match_net6(arr, address="address", prefix="prefix"): arr.is_record and {address, prefix}.issubset(arr.fields) and match_ip6(arr[address]) + and match_prefix(arr[prefix]) ) @@ -50,6 +59,15 @@ def parse_address4(str_arr): return utils.u8_to_ip4(out.view("uint8")) +def parse_address6(str_arr): + """Interpret (byte)strings as IPv6 
addresses + + Output will be fixed length 16 bytestring array + """ + out = lib.parse6(str_arr.offsets.data.astype("uint32"), str_arr.content.data) + return utils.u8_to_ip6(out.view("uint8")) + + def parse_net4(str_arr): """Interpret (byte)strings as IPv4 networks (address/prefix) @@ -96,15 +114,23 @@ def hosts4(nets, address="address", prefix="prefix"): ) -def dec4(func, match=match_ip4, outtype=ak.contents.NumpyArray): +def to_ip4(arr): + if arr.is_leaf: + return arr.data.view("uint32"), + else: + # bytestring or 4 * uint8 regular; returned as a 1-tuple because dec_ip calls func(*conv(arr)) + return arr.content.data.view("uint32"), + + +def to_ip6(arr): + # always pass as flat uint8 bytes; the rust functions read them in 16-byte groups via chunks_exact(16) + return arr.content.data.view("uint8"), + + +def dec_ip(func, conv=to_ip4, match=match_ip4, outtype=ak.contents.NumpyArray): @functools.wraps(func) def func1(arr): - if arr.is_leaf: - arr = arr.data.astype("uint32") - else: - # bytestring or 4 * uint8 regular - arr = arr.content.data.view("uint32") - return func(arr) + return func(*conv(arr)) return dec(func1, match=match, outtype=outtype, inmode="awkward") @@ -113,29 +139,42 @@ class IPAccessor: def __init__(self, accessor) -> None: self.accessor = accessor - is_unspecified4 = dec4(lib.is_unspecified4) - is_broadcast4 = dec4(lib.is_broadcast4) - is_global4 = dec4(lib.is_global4) - is_loopback4 = dec4(lib.is_loopback4) - is_private4 = dec4(lib.is_private4) - is_link_local4 = dec4(lib.is_link_local4) - is_shared4 = dec4(lib.is_shared4) - is_benchmarking4 = dec4(lib.is_benchmarking4) - is_reserved4 = dec4(lib.is_reserved4) - is_multicast4 = dec4(lib.is_multicast4) - is_documentation4 = dec4(lib.is_documentation4) + is_unspecified4 = dec_ip(lib.is_unspecified4) + is_broadcast4 = dec_ip(lib.is_broadcast4) + is_global4 = dec_ip(lib.is_global4) + is_loopback4 = dec_ip(lib.is_loopback4) + is_private4 = dec_ip(lib.is_private4) + is_link_local4 = dec_ip(lib.is_link_local4) + is_shared4 = dec_ip(lib.is_shared4) + is_benchmarking4 = dec_ip(lib.is_benchmarking4) + 
is_reserved4 = dec_ip(lib.is_reserved4) + is_multicast4 = dec_ip(lib.is_multicast4) + is_documentation4 = dec_ip(lib.is_documentation4) - to_string4 = dec4(lib.to_text4, outtype=utils.to_ak_string) + to_string4 = dec_ip(lib.to_text4, outtype=utils.to_ak_string) parse_address4 = dec(parse_address4, inmode="ak", match=match_stringlike) - parse_net4 = dec(parse_net4, inmode="ak", match=match_stringlike) contains4 = dec(contains4, inmode="ak", match=match_net4) - to_ipv6_mapped = dec(lib.to_ipv6_mapped, inmode="numpy", match=match_ip4, - outtype=utils.u8_to_ip6) + to_ipv6_mapped = dec_ip(lib.to_ipv6_mapped, outtype=utils.u8_to_ip6) hosts4 = dec(hosts4, match=match_net4, inmode="ak") + is_benchmarking6 = dec_ip(lib.is_benchmarking6, conv=to_ip6, match=match_ip6) + is_global6 = dec_ip(lib.is_global6, conv=to_ip6, match=match_ip6) + is_documentation6 = dec_ip(lib.is_documentation6, conv=to_ip6, match=match_ip6) + is_unspecified6 = dec_ip(lib.is_unspecified6, conv=to_ip6, match=match_ip6) + is_loopback6 = dec_ip(lib.is_loopback6, conv=to_ip6, match=match_ip6) + is_multicast6 = dec_ip(lib.is_multicast6, conv=to_ip6, match=match_ip6) + is_unicast6 = dec_ip(lib.is_unicast6, conv=to_ip6, match=match_ip6) + is_ipv4_mapped = dec_ip(lib.is_ipv4_mapped, conv=to_ip6, match=match_ip6) + is_unicast_link_local = dec_ip(lib.is_unicast_link_local, conv=to_ip6, match=match_ip6) + is_unique_local = dec_ip(lib.is_unique_local, conv=to_ip6, match=match_ip6) + + to_string6 = dec_ip(lib.to_text6, conv=to_ip6, match=match_ip6, outtype=utils.to_ak_string) + parse_address6 = dec(parse_address6, inmode="ak", match=match_stringlike) + + Accessor.register_accessor("ip", IPAccessor) diff --git a/src/lib.rs b/src/lib.rs index 74d4d0a..e9611a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ #![feature(addr_parse_ascii)] use pyo3::prelude::*; use core::net::Ipv4Addr; +use std::net::Ipv6Addr; use std::str::{self, FromStr}; use ipnet::Ipv4Net; use numpy::pyo3::Python; @@ -57,6 +58,34 @@ fn 
parse4<'py>(py: Python<'py>, offsets: PyReadonlyArray1<'py, u32>, Ok(out.into_pyarray_bound(py)) } +#[pyfunction] +fn to_text6<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) +-> PyResult<(Bound<'py, PyArray1>, Bound<'py, PyArray1>)> { + let mut offsets: Vec = vec!(0, ); + let mut data: Vec = Vec::new(); + for sl in x.as_slice().unwrap().chunks_exact(16) + { + data.extend(Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).to_string().as_bytes()); + offsets.push(data.len() as u32); + }; + Ok((data.into_pyarray_bound(py), offsets.into_pyarray_bound(py))) +} + +#[pyfunction] +fn parse6<'py>(py: Python<'py>, offsets: PyReadonlyArray1<'py, u32>, + data : PyReadonlyArray1<'py, u8> +) -> PyResult>> { + let ar = offsets.as_array(); + let sl = ar.as_slice().unwrap(); + let ar2 = data.as_array(); + let by = ar2.as_slice().unwrap(); + let mut out: Vec = Vec::with_capacity((sl.len() - 1) * 16); + for w in sl.windows(2) { + out.extend(Ipv6Addr::parse_ascii(&by[w[0] as usize..w[1] as usize]).unwrap().octets()) + }; + Ok(out.into_pyarray_bound(py)) +} + /// Parse strings into IP4 networks (length 4 bytestring and 1-byte prefix value) #[pyfunction] fn parsenet4<'py>(py: Python<'py>, @@ -176,6 +205,86 @@ fn is_documentation4<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u32>) -> PyR Ok(out.into_pyarray_bound(py)) } +#[pyfunction] +fn is_benchmarking6<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_benchmarking() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn is_documentation6<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_documentation() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn 
is_global6<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_global() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn is_ipv4_mapped<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_ipv4_mapped() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn is_loopback6<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_loopback() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn is_multicast6<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_multicast() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn is_unicast6<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_unicast() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn is_unicast_link_local<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_unicast_link_local() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn is_unique_local<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + 
Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_unique_local() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + +#[pyfunction] +fn is_unspecified6<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u8>) -> PyResult>> { + let out: Vec = x.as_slice().unwrap().chunks_exact(16).map(|sl | { + Ipv6Addr::from_bits(u128::from_be_bytes(sl.try_into().unwrap())).is_unspecified() + }).collect(); + Ok(out.into_pyarray_bound(py)) +} + #[pyfunction] fn to_ipv6_mapped<'py>(py: Python<'py>, x: PyReadonlyArray1<'py, u32>) -> PyResult>> { let mut out: Vec = Vec::with_capacity(x.len().unwrap() * 16); @@ -205,5 +314,18 @@ fn akimbo_ip(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(contains_one4, m)?)?; m.add_function(wrap_pyfunction!(to_ipv6_mapped, m)?)?; m.add_function(wrap_pyfunction!(hosts4, m)?)?; + + m.add_function(wrap_pyfunction!(is_benchmarking6, m)?)?; + m.add_function(wrap_pyfunction!(is_documentation6, m)?)?; + m.add_function(wrap_pyfunction!(is_global6, m)?)?; + m.add_function(wrap_pyfunction!(is_ipv4_mapped, m)?)?; + m.add_function(wrap_pyfunction!(is_loopback6, m)?)?; + m.add_function(wrap_pyfunction!(is_multicast6, m)?)?; + m.add_function(wrap_pyfunction!(is_unicast6, m)?)?; + m.add_function(wrap_pyfunction!(is_unicast_link_local, m)?)?; + m.add_function(wrap_pyfunction!(is_unique_local, m)?)?; + m.add_function(wrap_pyfunction!(is_unspecified6, m)?)?; + m.add_function(wrap_pyfunction!(to_text6, m)?)?; + m.add_function(wrap_pyfunction!(parse6, m)?)?; Ok(()) } \ No newline at end of file diff --git a/tests/test_core.py b/tests/test_core.py index 042be64..8572a39 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,3 +1,4 @@ +import pyarrow as pa import pandas as pd import pytest @@ -5,9 +6,12 @@ import akimbo_ip # registers .ip subaccessor +bytestring4 = pd.ArrowDtype(pa.binary(4)) +bytestring16 = pd.ArrowDtype(pa.binary(16)) + def test_simple4(): - s1 = pd.Series([0], dtype="uint32") + s1 = pd.Series([0], 
dtype="u4") out = s1.ak.ip.is_global4() assert out[0] is False out2 = s1.ak.ip.to_string4() @@ -18,6 +22,19 @@ def test_simple4(): assert out[0] == b"\x00\x00\x00\x00" +def test_simple6(): + s1 = pd.Series([b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"], + dtype=bytestring16) + out = s1.ak.ip.is_global6() + assert out.tolist() == [False, False] + + out2 = s1.ak.ip.to_string6() + assert out2.tolist() == ["::", "::1"] + out3 = out2.ak.ip.parse_address6() + assert out3[1] == s1[1] + + def test_nested(): s = pd.DataFrame({"a": [0], "b": [0]}).ak.merge() out = s.ak.ip.is_global4(where="b") @@ -44,14 +61,14 @@ def test_err(): def test_6_out(): - s1 = pd.Series([1], dtype="uint32") + s1 = pd.Series([1], dtype="u4") out = s1.ak.ip.to_ipv6_mapped() assert out[0] == b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x01' def test_rename(): - s = pd.DataFrame({"address": pd.Series([1], dtype="uint32"), - "end": [16]}).ak.merge() + s = pd.DataFrame({"address": pd.Series([1], dtype="u4"), + "end": pd.Series([16], dtype="u1")}).ak.merge() out = s.ak.ip.contains4(b"\x00\x00\x00\x01") assert s.tolist() == out.tolist() # no change, no match out = out.ak.ip.contains4(b"\x00\x00\x00\x01", match_kwargs={"prefix": "end"}) @@ -60,8 +77,8 @@ def test_rename(): def test_inner_list_hosts(): # note: both addresses are rounded down - s = pd.DataFrame({"address": pd.Series([1, 2], dtype="uint32"), - "prefix": [31, 29]}).ak.merge() + s = pd.DataFrame({"address": pd.Series([1, 2], dtype="u4"), + "prefix": pd.Series([31, 29], dtype="u1")}).ak.merge() out = s.ak.ip.hosts4() assert out.to_list() == [ # includes gateway/broadcast