jq-style Pipelines with plumbum¶
This notebook demonstrates how to use pdum.plumbum.jq to recreate jq-style transformations in Python. We start with simple pipelines and then mirror each of the 43 scenarios from docs/jq-fu-43-examples.md.
In [1]:
Copied!
from __future__ import annotations
import base64
import json
import re
from datetime import datetime, timezone
from itertools import product
from urllib.parse import quote_plus
from pdum.plumbum import pb
from pdum.plumbum.iterops import chain, dedup, select, where
from pdum.plumbum.jq import (
coalesce,
delete_path,
explode,
field,
iter_paths,
resolve_path,
set_path,
transform,
walk_tree,
)
from pdum.plumbum.jq.typing import Field, Index
from __future__ import annotations
import base64
import json
import re
from datetime import datetime, timezone
from itertools import product
from urllib.parse import quote_plus
from pdum.plumbum import pb
from pdum.plumbum.iterops import chain, dedup, select, where
from pdum.plumbum.jq import (
coalesce,
delete_path,
explode,
field,
iter_paths,
resolve_path,
set_path,
transform,
walk_tree,
)
from pdum.plumbum.jq.typing import Field, Index
Helper utilities¶
In [2]:
Copied!
def path_tokens(path):
return [Field(p) if isinstance(p, str) else Index(p) for p in path]
def drop_empty(tree):
result = tree
changed = True
while changed:
changed = False
for path, value in list(walk_tree(result)):
if not path:
continue
if value in (None, "", []) or value == {}:
result = delete_path(result, path_tokens(path))
changed = True
break
return result
def remove_keys(tree, names):
result = tree
for path, _ in list(walk_tree(result)):
if path and isinstance(path[-1], str) and path[-1] in names:
result = delete_path(result, path_tokens(path))
return result
def stringify_non_ascii(tree):
result = tree
for path, value in list(walk_tree(result)):
if path and isinstance(value, str) and any(ord(ch) > 127 for ch in value):
result = set_path(result, path_tokens(path), json.dumps(value))
return result
def mask_secret_like(tree):
pattern = re.compile(r"secret|token|password", re.IGNORECASE)
result = tree
for path, _ in list(walk_tree(result)):
if path and isinstance(path[-1], str) and pattern.search(path[-1]):
result = set_path(result, path_tokens(path), "***")
return result
def leaf_paths(tree):
return [
{"path": list(path), "value": value} for path, value in walk_tree(tree) if not isinstance(value, (dict, list))
]
def path_tokens(path):
return [Field(p) if isinstance(p, str) else Index(p) for p in path]
def drop_empty(tree):
result = tree
changed = True
while changed:
changed = False
for path, value in list(walk_tree(result)):
if not path:
continue
if value in (None, "", []) or value == {}:
result = delete_path(result, path_tokens(path))
changed = True
break
return result
def remove_keys(tree, names):
result = tree
for path, _ in list(walk_tree(result)):
if path and isinstance(path[-1], str) and path[-1] in names:
result = delete_path(result, path_tokens(path))
return result
def stringify_non_ascii(tree):
result = tree
for path, value in list(walk_tree(result)):
if path and isinstance(value, str) and any(ord(ch) > 127 for ch in value):
result = set_path(result, path_tokens(path), json.dumps(value))
return result
def mask_secret_like(tree):
pattern = re.compile(r"secret|token|password", re.IGNORECASE)
result = tree
for path, _ in list(walk_tree(result)):
if path and isinstance(path[-1], str) and pattern.search(path[-1]):
result = set_path(result, path_tokens(path), "***")
return result
def leaf_paths(tree):
return [
{"path": list(path), "value": value} for path, value in walk_tree(tree) if not isinstance(value, (dict, list))
]
Warm-up¶
In [3]:
Copied!
users = [
{"id": 1, "name": "Ada", "scores": [10, 15]},
{"id": 2, "name": "Linus", "scores": [20]},
]
names = users > (select(field("name")) | pb(list))
score_sum = sum(list(users > explode("[].scores")))
names, score_sum
users = [
{"id": 1, "name": "Ada", "scores": [10, 15]},
{"id": 2, "name": "Linus", "scores": [20]},
]
names = users > (select(field("name")) | pb(list))
score_sum = sum(list(users > explode("[].scores")))
names, score_sum
Out[3]:
(['Ada', 'Linus'], 45)
Example 1 — Tag child items with parent fields¶
jq
jq
.[] | .name as $n | .group as $g | .items[] | {name:$n, group:$g, item:.}
In [4]:
Copied!
groups = [
{"name": "alpha", "group": "A", "items": ["h1", "h2"]},
{"name": "beta", "group": "B", "items": ["h3"]},
]
tagged = groups > (
explode("[]")
| select(
lambda parent: [{"name": parent["name"], "group": parent["group"], "item": item} for item in parent["items"]]
)
| chain
| pb(list)
)
tagged
groups = [
{"name": "alpha", "group": "A", "items": ["h1", "h2"]},
{"name": "beta", "group": "B", "items": ["h3"]},
]
tagged = groups > (
explode("[]")
| select(
lambda parent: [{"name": parent["name"], "group": parent["group"], "item": item} for item in parent["items"]]
)
| chain
| pb(list)
)
tagged
Out[4]:
[{'name': 'alpha', 'group': 'A', 'item': 'h1'},
{'name': 'alpha', 'group': 'A', 'item': 'h2'},
{'name': 'beta', 'group': 'B', 'item': 'h3'}]
In [5]:
Copied!
catalog = {"items": ["gala", "fuji", "braeburn"]}
items = list(catalog > explode("items"))
[{"idx": idx, "item": item} for idx, item in enumerate(items)]
catalog = {"items": ["gala", "fuji", "braeburn"]}
items = list(catalog > explode("items"))
[{"idx": idx, "item": item} for idx, item in enumerate(items)]
Out[5]:
[{'idx': 0, 'item': 'gala'},
{'idx': 1, 'item': 'fuji'},
{'idx': 2, 'item': 'braeburn'}]
Example 3 — Flatten nested arrays with lineage¶
jq
jq
.[] | .id as $pid | .buckets[] as $b | $b.items[] | {parent:$pid, bucket:$b.name, item:.}
In [6]:
Copied!
projects = [
{
"id": "p-1",
"buckets": [
{"name": "red", "items": ["r1", "r2"]},
{"name": "blue", "items": ["b1"]},
],
},
{"id": "p-2", "buckets": [{"name": "green", "items": ["g1", "g2"]}]},
]
flattened = projects > (
explode("[]")
| select(
lambda proj: [
{"parent": proj["id"], "bucket": bucket["name"], "item": item}
for bucket in proj["buckets"]
for item in bucket["items"]
]
)
| chain
| pb(list)
)
flattened
projects = [
{
"id": "p-1",
"buckets": [
{"name": "red", "items": ["r1", "r2"]},
{"name": "blue", "items": ["b1"]},
],
},
{"id": "p-2", "buckets": [{"name": "green", "items": ["g1", "g2"]}]},
]
flattened = projects > (
explode("[]")
| select(
lambda proj: [
{"parent": proj["id"], "bucket": bucket["name"], "item": item}
for bucket in proj["buckets"]
for item in bucket["items"]
]
)
| chain
| pb(list)
)
flattened
Out[6]:
[{'parent': 'p-1', 'bucket': 'red', 'item': 'r1'},
{'parent': 'p-1', 'bucket': 'red', 'item': 'r2'},
{'parent': 'p-1', 'bucket': 'blue', 'item': 'b1'},
{'parent': 'p-2', 'bucket': 'green', 'item': 'g1'},
{'parent': 'p-2', 'bucket': 'green', 'item': 'g2'}]
Example 4 — Explode object fields to key/value records¶
jq
jq
.props | to_entries[] | {key:.key, value:.value}
In [7]:
Copied!
record = {"props": {"cpu": "m2", "ram": "32gb"}}
[{"key": path[-1], "value": value} for path, value in iter_paths(record, "props.*")]
record = {"props": {"cpu": "m2", "ram": "32gb"}}
[{"key": path[-1], "value": value} for path, value in iter_paths(record, "props.*")]
Out[7]:
[{'key': 'cpu', 'value': 'm2'}, {'key': 'ram', 'value': '32gb'}]
Example 5 — Multi-condition filter with defaults¶
jq
jq
select((.status // "unknown") == "ok" and (.lat? // 0) != 0)
In [8]:
Copied!
readings = [
{"id": 1, "status": "ok", "lat": 51.5},
{"id": 2, "lat": 0.0},
{"id": 3, "status": "ok", "lat": 0},
{"id": 4, "status": None, "lat": 48.1},
]
filtered = readings > (
where(lambda row: (row > coalesce("status", default="unknown")) == "ok" and (row > coalesce("lat", default=0)) != 0)
| pb(list)
)
filtered
readings = [
{"id": 1, "status": "ok", "lat": 51.5},
{"id": 2, "lat": 0.0},
{"id": 3, "status": "ok", "lat": 0},
{"id": 4, "status": None, "lat": 48.1},
]
filtered = readings > (
where(lambda row: (row > coalesce("status", default="unknown")) == "ok" and (row > coalesce("lat", default=0)) != 0)
| pb(list)
)
filtered
Out[8]:
[{'id': 1, 'status': 'ok', 'lat': 51.5}]
Example 6 — Drop null/empty fields recursively¶
jq
jq
walk(if type=="object" then with_entries(select(.value!=null and .value!=[] and .value!={})) else . end)
In [9]:
Copied!
raw = {
"meta": {"source": "ingest", "tags": [], "notes": None},
"items": [
{"id": 1, "extra": {}},
{"id": 2, "extra": {"comment": "ok"}},
],
"misc": "",
}
drop_empty(raw)
raw = {
"meta": {"source": "ingest", "tags": [], "notes": None},
"items": [
{"id": 1, "extra": {}},
{"id": 2, "extra": {"comment": "ok"}},
],
"misc": "",
}
drop_empty(raw)
Out[9]:
{'meta': {'source': 'ingest'},
'items': [{'id': 1}, {'id': 2, 'extra': {'comment': 'ok'}}]}
In [10]:
Copied!
catalog_entry = {"sku": "A-100", "tags": None}
{
**catalog_entry,
"price": catalog_entry > coalesce("price", default=0),
"tags": catalog_entry > coalesce("tags", default=[]),
}
catalog_entry = {"sku": "A-100", "tags": None}
{
**catalog_entry,
"price": catalog_entry > coalesce("price", default=0),
"tags": catalog_entry > coalesce("tags", default=[]),
}
Out[10]:
{'sku': 'A-100', 'tags': [], 'price': 0}
Example 8 — Remove noisy keys anywhere¶
jq
jq
walk(if type=="object" then del(.debug,.temp,.trace) else . end)
In [11]:
Copied!
payload = {
"debug": {"state": "verbose"},
"data": {"value": 3, "trace": "abc", "nested": {"temp": 42, "value": 9}},
}
remove_keys(payload, {"debug", "temp", "trace"})
payload = {
"debug": {"state": "verbose"},
"data": {"value": 3, "trace": "abc", "nested": {"temp": 42, "value": 9}},
}
remove_keys(payload, {"debug", "temp", "trace"})
Out[11]:
{'data': {'value': 3, 'nested': {'value': 9}}}
Example 9 — Parse with regex groups¶
jq
jq
.label | capture("^(?<cat>[A-Z]{2})-(?<id>\d+)$") | {category:.cat, id:(.id|tonumber)}
In [12]:
Copied!
labelled = {"label": "IN-204"}
match_obj = re.match(r"^(?P<cat>[A-Z]{2})-(?P<id>\d+)$", labelled > field("label"))
{"category": match_obj.group("cat"), "id": int(match_obj.group("id"))}
labelled = {"label": "IN-204"}
match_obj = re.match(r"^(?P[A-Z]{2})-(?P\d+)$", labelled > field("label"))
{"category": match_obj.group("cat"), "id": int(match_obj.group("id"))}
Out[12]:
{'category': 'IN', 'id': 204}
In [13]:
Copied!
endpoint = {"path": "/api/v2/accounts/42"}
endpoint > transform("path", lambda value: re.sub(r"^/api/v\d+/", "/api/latest/", value))
endpoint = {"path": "/api/v2/accounts/42"}
endpoint > transform("path", lambda value: re.sub(r"^/api/v\d+/", "/api/latest/", value))
Out[13]:
{'path': '/api/latest/accounts/42'}
In [14]:
Copied!
article = {"category": "blog", "id": 42, "title": 123}
{
"slug": f"{article['category']}/{article['id']}",
"title": str(article["title"]),
}
article = {"category": "blog", "id": 42, "title": 123}
{
"slug": f"{article['category']}/{article['id']}",
"title": str(article["title"]),
}
Out[14]:
{'slug': 'blog/42', 'title': '123'}
Example 12 — Safe numeric coercion¶
jq
jq
.price = ((.price? // 0) | tonumber) | .qty = ((.qty? // 1) | tonumber)
In [15]:
Copied!
row = {"price": "12.5", "qty": None}
{
"price": float(row > coalesce("price", default=0)),
"qty": int(row > coalesce("qty", default=1)),
}
row = {"price": "12.5", "qty": None}
{
"price": float(row > coalesce("price", default=0)),
"qty": int(row > coalesce("qty", default=1)),
}
Out[15]:
{'price': 12.5, 'qty': 1}
In [16]:
Copied!
invoice = {"total": 123.4567}
invoice > transform("total", lambda value: round(((value or 0) * 100)) / 100)
invoice = {"total": 123.4567}
invoice > transform("total", lambda value: round(((value or 0) * 100)) / 100)
Out[16]:
{'total': 123.46}
Example 14 — ISO8601 ↔ epoch¶
jq
jq
.ts = (.timestamp|fromdateiso8601) | .ts_human = (.ts|todateiso8601)
In [17]:
Copied!
event = {"timestamp": "2024-01-05T12:30:00Z"}
instant = datetime.fromisoformat(event["timestamp"].replace("Z", "+00:00"))
{
**event,
"ts": int(instant.replace(tzinfo=timezone.utc).timestamp()),
"ts_human": instant.isoformat(),
}
event = {"timestamp": "2024-01-05T12:30:00Z"}
instant = datetime.fromisoformat(event["timestamp"].replace("Z", "+00:00"))
{
**event,
"ts": int(instant.replace(tzinfo=timezone.utc).timestamp()),
"ts_human": instant.isoformat(),
}
Out[17]:
{'timestamp': '2024-01-05T12:30:00Z',
'ts': 1704457800,
'ts_human': '2024-01-05T12:30:00+00:00'}
Example 15 — Minute buckets¶
jq
jq
. | (.timestamp|fromdateiso8601/60|floor) as $m | {minute:$m, event:.}
In [18]:
Copied!
raw_events = [
{"timestamp": "2024-02-01T10:00:05Z", "event": "login"},
{"timestamp": "2024-02-01T10:00:45Z", "event": "heartbeat"},
{"timestamp": "2024-02-01T10:01:10Z", "event": "logout"},
]
def minute_bucket(row):
dt = datetime.fromisoformat(row["timestamp"].replace("Z", "+00:00"))
minute = int(dt.replace(tzinfo=timezone.utc).timestamp() // 60)
return {"minute": minute, "event": row}
[minute_bucket(row) for row in raw_events]
raw_events = [
{"timestamp": "2024-02-01T10:00:05Z", "event": "login"},
{"timestamp": "2024-02-01T10:00:45Z", "event": "heartbeat"},
{"timestamp": "2024-02-01T10:01:10Z", "event": "logout"},
]
def minute_bucket(row):
dt = datetime.fromisoformat(row["timestamp"].replace("Z", "+00:00"))
minute = int(dt.replace(tzinfo=timezone.utc).timestamp() // 60)
return {"minute": minute, "event": row}
[minute_bucket(row) for row in raw_events]
Out[18]:
[{'minute': 28446360,
'event': {'timestamp': '2024-02-01T10:00:05Z', 'event': 'login'}},
{'minute': 28446360,
'event': {'timestamp': '2024-02-01T10:00:45Z', 'event': 'heartbeat'}},
{'minute': 28446361,
'event': {'timestamp': '2024-02-01T10:01:10Z', 'event': 'logout'}}]
Example 16 — Merge parent keys into child¶
jq
jq
.name as $n | .region as $r | .items[] | . + {name:$n, region:$r}
In [19]:
Copied!
regions = [
{"name": "alpha", "region": "EMEA", "items": [{"id": 1}, {"id": 2}]},
{"name": "beta", "region": "NA", "items": [{"id": 3}]},
]
merged = regions > (
explode("[]")
| select(lambda parent: [{**item, "name": parent["name"], "region": parent["region"]} for item in parent["items"]])
| chain
| pb(list)
)
merged
regions = [
{"name": "alpha", "region": "EMEA", "items": [{"id": 1}, {"id": 2}]},
{"name": "beta", "region": "NA", "items": [{"id": 3}]},
]
merged = regions > (
explode("[]")
| select(lambda parent: [{**item, "name": parent["name"], "region": parent["region"]} for item in parent["items"]])
| chain
| pb(list)
)
merged
Out[19]:
[{'id': 1, 'name': 'alpha', 'region': 'EMEA'},
{'id': 2, 'name': 'alpha', 'region': 'EMEA'},
{'id': 3, 'name': 'beta', 'region': 'NA'}]
In [20]:
Copied!
doc = {"meta": {"source": "ingest-0"}}
set_path(doc, "meta.source", "ingest-1")
doc = {"meta": {"source": "ingest-0"}}
set_path(doc, "meta.source", "ingest-1")
Out[20]:
{'meta': {'source': 'ingest-1'}}
In [21]:
Copied!
order = {"items": [{"sku": "a", "price": 10.0}, {"sku": "b", "price": 12.0}]}
transform(order, "items[].price", lambda price: round(price * 1.05, 2))
order = {"items": [{"sku": "a", "price": 10.0}, {"sku": "b", "price": 12.0}]}
transform(order, "items[].price", lambda price: round(price * 1.05, 2))
Out[21]:
<transform>(*({'items': [{'sku': 'a', 'price': 10.0}, {'sku': 'b', 'price': 12.0}]}, 'items[].price', <function <lambda> at 0x7f024408e160>), **{})
In [22]:
Copied!
stream = [
{"id": 1, "value": "first"},
{"id": 1, "value": "duplicate"},
{"id": 2, "value": "second"},
]
list(stream > (dedup(lambda row: row["id"]) | pb(list)))
stream = [
{"id": 1, "value": "first"},
{"id": 1, "value": "duplicate"},
{"id": 2, "value": "second"},
]
list(stream > (dedup(lambda row: row["id"]) | pb(list)))
Out[22]:
[{'id': 1, 'value': 'first'}, {'id': 2, 'value': 'second'}]
Example 20 — Running totals¶
jq
jq
foreach inputs as $r (0; . + ($r.value // 0); {ts:$r.ts, running:.})
In [23]:
Copied!
records = [
{"ts": "00:00", "value": 5},
{"ts": "00:01", "value": 3},
{"ts": "00:02", "value": None},
{"ts": "00:03", "value": 2},
]
total = 0
running = []
for row in records:
total += row.get("value") or 0
running.append({"ts": row["ts"], "running": total})
running
records = [
{"ts": "00:00", "value": 5},
{"ts": "00:01", "value": 3},
{"ts": "00:02", "value": None},
{"ts": "00:03", "value": 2},
]
total = 0
running = []
for row in records:
total += row.get("value") or 0
running.append({"ts": row["ts"], "running": total})
running
Out[23]:
[{'ts': '00:00', 'running': 5},
{'ts': '00:01', 'running': 8},
{'ts': '00:02', 'running': 8},
{'ts': '00:03', 'running': 10}]
In [24]:
Copied!
rows = [
{"user": "a", "value": 2},
{"user": "a", "value": 3},
{"user": "b", "value": 4},
{"user": "b", "value": 1},
]
windows = []
current_user = None
running = 0
for row in rows:
if current_user is None or row["user"] == current_user:
current_user = row["user"]
running += row["value"]
else:
windows.append({"user": current_user, "sum": running})
current_user = row["user"]
running = row["value"]
windows.append({"user": current_user, "sum": running})
windows
rows = [
{"user": "a", "value": 2},
{"user": "a", "value": 3},
{"user": "b", "value": 4},
{"user": "b", "value": 1},
]
windows = []
current_user = None
running = 0
for row in rows:
if current_user is None or row["user"] == current_user:
current_user = row["user"]
running += row["value"]
else:
windows.append({"user": current_user, "sum": running})
current_user = row["user"]
running = row["value"]
windows.append({"user": current_user, "sum": running})
windows
Out[24]:
[{'user': 'a', 'sum': 5}, {'user': 'b', 'sum': 5}]
Example 22 — Join against side table¶
jq
jq
INDEX($users[];.id) as $U | inputs | . + {user:($U[.user_id]//{})}
In [25]:
Copied!
users_side = [
{"id": 10, "name": "Ada"},
{"id": 11, "name": "Linus"},
]
orders = [
{"order": "o-1", "user_id": 10},
{"order": "o-2", "user_id": 99},
]
directory = {user["id"]: user for user in users_side}
[{**order, "user": directory.get(order["user_id"], {})} for order in orders]
users_side = [
{"id": 10, "name": "Ada"},
{"id": 11, "name": "Linus"},
]
orders = [
{"order": "o-1", "user_id": 10},
{"order": "o-2", "user_id": 99},
]
directory = {user["id"]: user for user in users_side}
[{**order, "user": directory.get(order["user_id"], {})} for order in orders]
Out[25]:
[{'order': 'o-1', 'user_id': 10, 'user': {'id': 10, 'name': 'Ada'}},
{'order': 'o-2', 'user_id': 99, 'user': {}}]
Example 23 — Join and project single field¶
jq
jq
INDEX($u[];.id) as $U | inputs | . + {user_name:($U[.user_id].name // "unknown")}
In [26]:
Copied!
users_side = [
{"id": 10, "name": "Ada"},
{"id": 11, "name": "Linus"},
]
orders = [
{"order": "o-1", "user_id": 10},
{"order": "o-2", "user_id": 12},
]
directory = {user["id"]: user["name"] for user in users_side}
[{**order, "user_name": directory.get(order["user_id"], "unknown")} for order in orders]
users_side = [
{"id": 10, "name": "Ada"},
{"id": 11, "name": "Linus"},
]
orders = [
{"order": "o-1", "user_id": 10},
{"order": "o-2", "user_id": 12},
]
directory = {user["id"]: user["name"] for user in users_side}
[{**order, "user_name": directory.get(order["user_id"], "unknown")} for order in orders]
Out[26]:
[{'order': 'o-1', 'user_id': 10, 'user_name': 'Ada'},
{'order': 'o-2', 'user_id': 12, 'user_name': 'unknown'}]
In [27]:
Copied!
record = {"oldName": "value", "x_version": "1", "other": 10}
renamed = {}
for key, value in record.items():
if key == "oldName":
renamed["newName"] = value
elif key.startswith("x_"):
renamed[key[2:]] = value
else:
renamed[key] = value
renamed
record = {"oldName": "value", "x_version": "1", "other": 10}
renamed = {}
for key, value in record.items():
if key == "oldName":
renamed["newName"] = value
elif key.startswith("x_"):
renamed[key[2:]] = value
else:
renamed[key] = value
renamed
Out[27]:
{'newName': 'value', 'version': '1', 'other': 10}
In [28]:
Copied!
record = {"id": 1, "meta": {"id": "m-1", "other": "x"}}
if "meta" in record and "id" in record["meta"]:
promoted = {**record, "meta_id": record["meta"]["id"]}
promoted["meta"] = {k: v for k, v in record["meta"].items() if k != "id"}
else:
promoted = record
promoted
record = {"id": 1, "meta": {"id": "m-1", "other": "x"}}
if "meta" in record and "id" in record["meta"]:
promoted = {**record, "meta_id": record["meta"]["id"]}
promoted["meta"] = {k: v for k, v in record["meta"].items() if k != "id"}
else:
promoted = record
promoted
Out[28]:
{'id': 1, 'meta': {'other': 'x'}, 'meta_id': 'm-1'}
In [29]:
Copied!
structure = {"a": 1, "b": {"c": 2, "d": [3, 4]}}
leaf_paths(structure)
structure = {"a": 1, "b": {"c": 2, "d": [3, 4]}}
leaf_paths(structure)
Out[29]:
[{'path': ['a'], 'value': 1},
{'path': ['b', 'c'], 'value': 2},
{'path': ['b', 'd', 0], 'value': 3},
{'path': ['b', 'd', 1], 'value': 4}]
In [30]:
Copied!
values_to_coerce = [{"number": "10"}, {"number": "bad"}, {}]
def safe_int(row):
try:
return int(row["number"])
except (KeyError, TypeError, ValueError):
return 0
[safe_int(row) for row in values_to_coerce]
values_to_coerce = [{"number": "10"}, {"number": "bad"}, {}]
def safe_int(row):
try:
return int(row["number"])
except (KeyError, TypeError, ValueError):
return 0
[safe_int(row) for row in values_to_coerce]
Out[30]:
[10, 0, 0]
In [31]:
Copied!
rows = [
{"id": 1, "email": "a@example.com"},
{"id": 2, "email": "bad"},
{"email": "missing"},
]
valid = rows > (where(lambda row: "id" in row and bool(re.match(r".+@.+\..+", row.get("email", "")))) | pb(list))
valid
rows = [
{"id": 1, "email": "a@example.com"},
{"id": 2, "email": "bad"},
{"email": "missing"},
]
valid = rows > (where(lambda row: "id" in row and bool(re.match(r".+@.+\..+", row.get("email", "")))) | pb(list))
valid
Out[31]:
[{'id': 1, 'email': 'a@example.com'}]
Example 29 — Assert invariant¶
jq
jq
. as $o | if ($o.qty // 0)>=0 then $o else error("negative qty") end
In [32]:
Copied!
def enforce_quantity(row):
qty = row.get("qty", 0) or 0
if qty < 0:
raise ValueError("negative qty")
return row
enforce_quantity({"qty": 2}), enforce_quantity({"qty": 0})
def enforce_quantity(row):
qty = row.get("qty", 0) or 0
if qty < 0:
raise ValueError("negative qty")
return row
enforce_quantity({"qty": 2}), enforce_quantity({"qty": 0})
Out[32]:
({'qty': 2}, {'qty': 0})
In [33]:
Copied!
item = {"sku": "sku-1", "qty": 3}
base64.b64encode(json.dumps({"sku": item["sku"], "qty": item["qty"]}, sort_keys=True).encode()).decode()
item = {"sku": "sku-1", "qty": 3}
base64.b64encode(json.dumps({"sku": item["sku"], "qty": item["qty"]}, sort_keys=True).encode()).decode()
Out[33]:
'eyJxdHkiOiAzLCAic2t1IjogInNrdS0xIn0='
In [34]:
Copied!
row = {"q": "data science", "a": "x", "b": "y", "c": "z"}
url = f"https://x/?q={quote_plus(row['q'])}"
csv_line = ",".join([row["a"], row["b"], row["c"]])
{"url": url, "csv": csv_line}
row = {"q": "data science", "a": "x", "b": "y", "c": "z"}
url = f"https://x/?q={quote_plus(row['q'])}"
csv_line = ",".join([row["a"], row["b"], row["c"]])
{"url": url, "csv": csv_line}
Out[34]:
{'url': 'https://x/?q=data+science', 'csv': 'x,y,z'}
In [35]:
Copied!
log_line = "ip=1.1.1.1 status=200 t=45ms"
parsed = {}
for part in log_line.split():
key, value = part.split("=", 1)
parsed[key] = value
{
"ip": parsed.get("ip"),
"status": int(parsed.get("status", 0)),
"ms": int(parsed.get("t", "0ms").replace("ms", "")),
}
log_line = "ip=1.1.1.1 status=200 t=45ms"
parsed = {}
for part in log_line.split():
key, value = part.split("=", 1)
parsed[key] = value
{
"ip": parsed.get("ip"),
"status": int(parsed.get("status", 0)),
"ms": int(parsed.get("t", "0ms").replace("ms", "")),
}
Out[35]:
{'ip': '1.1.1.1', 'status': 200, 'ms': 45}
In [36]:
Copied!
structure = {
"root": [
{"timestamp": "2024-01-01T00:00:00Z"},
{"nested": {"timestamp": "2024-01-02T00:00:00Z"}},
]
}
[match for match in resolve_path(structure, "..timestamp")]
structure = {
"root": [
{"timestamp": "2024-01-01T00:00:00Z"},
{"nested": {"timestamp": "2024-01-02T00:00:00Z"}},
]
}
[match for match in resolve_path(structure, "..timestamp")]
Out[36]:
[]
Example 34 — Cartesian product generator¶
jq
jq
-n '[1,2,3] as $a | ["x","y"] as $b | $a[] as $i | $b[] as $j | {i:$i,j:$j}'
In [37]:
Copied!
[{"i": i, "j": j} for i, j in product([1, 2, 3], ["x", "y"])]
[{"i": i, "j": j} for i, j in product([1, 2, 3], ["x", "y"])]
Out[37]:
[{'i': 1, 'j': 'x'},
{'i': 1, 'j': 'y'},
{'i': 2, 'j': 'x'},
{'i': 2, 'j': 'y'},
{'i': 3, 'j': 'x'},
{'i': 3, 'j': 'y'}]
Example 35 — Params from shell args¶
jq
jq
-n --arg user "$USER" --argjson cfg '{"k":1}' '{run_by:$user, cfg:$cfg}'
In [38]:
Copied!
user = "ada"
cfg = {"k": 1}
{"run_by": user, "cfg": cfg}
user = "ada"
cfg = {"k": 1}
{"run_by": user, "cfg": cfg}
Out[38]:
{'run_by': 'ada', 'cfg': {'k': 1}}
In [39]:
Copied!
row = {"id": 100, "a": 2, "b": 3}
{"id": row["id"], "extended": row["a"] + row["b"]}
row = {"id": 100, "a": 2, "b": 3}
{"id": row["id"], "extended": row["a"] + row["b"]}
Out[39]:
{'id': 100, 'extended': 5}
Example 37 — Switch by type¶
jq
jq
type as $t | {type:$t, value:(if $t=="number" then . * 2 elif $t=="string" then .+"!" else . end)}
In [40]:
Copied!
def transform_value(value):
kind = type(value)
if kind in (int, float):
return {"type": "number", "value": value * 2}
if kind is str:
return {"type": "string", "value": value + "!"}
return {"type": kind.__name__, "value": value}
[transform_value(v) for v in [10, "hi", [1, 2]]]
def transform_value(value):
kind = type(value)
if kind in (int, float):
return {"type": "number", "value": value * 2}
if kind is str:
return {"type": "string", "value": value + "!"}
return {"type": kind.__name__, "value": value}
[transform_value(v) for v in [10, "hi", [1, 2]]]
Out[40]:
[{'type': 'number', 'value': 20},
{'type': 'string', 'value': 'hi!'},
{'type': 'list', 'value': [1, 2]}]
Example 38 — Stringify non-ASCII¶
jq
jq
walk(if type=="string" and (.[0:]|test("[^\u0000-\u007F]")) then @json else . end)
In [41]:
Copied!
strings = {"english": "hello", "japanese": "こんにちは"}
stringify_non_ascii(strings)
strings = {"english": "hello", "japanese": "こんにちは"}
stringify_non_ascii(strings)
Out[41]:
{'english': 'hello', 'japanese': '"\\u3053\\u3093\\u306b\\u3061\\u306f"'}
Example 39 — Mask secrets¶
jq
jq
walk(if type=="object" then with_entries(if (.key|test("(?i)secret|token|password")) then .value="***" else . end) else . end)
In [42]:
Copied!
secrets = {"token": "abcd", "nested": {"apiSecret": "123", "visible": "ok"}}
mask_secret_like(secrets)
secrets = {"token": "abcd", "nested": {"apiSecret": "123", "visible": "ok"}}
mask_secret_like(secrets)
Out[42]:
{'token': '***', 'nested': {'apiSecret': '***', 'visible': 'ok'}}
In [43]:
Copied!
record = {"id": 1, "user": "ada", "total": 42.5}
",".join([str(record["id"]), record["user"], str(record["total"])])
record = {"id": 1, "user": "ada", "total": 42.5}
",".join([str(record["id"]), record["user"], str(record["total"])])
Out[43]:
'1,ada,42.5'
In [44]:
Copied!
order = {"id": 1, "meta": {"version": "v1"}}
" ".join(
[
str(order.get("id", "")),
order.get("name", ""),
order.get("meta", {}).get("version", ""),
]
)
order = {"id": 1, "meta": {"version": "v1"}}
" ".join(
[
str(order.get("id", "")),
order.get("name", ""),
order.get("meta", {}).get("version", ""),
]
)
Out[44]:
'1\t\tv1'
Example 42 — Propagate parent field to all nested objects¶
jq
jq
.name as $n | .. | objects | . + {parent_name:$n}
In [45]:
Copied!
record = {"name": "root", "children": [{"value": 1}, {"value": 2}]}
[{**child, "parent_name": record["name"]} for child in record["children"]]
record = {"name": "root", "children": [{"value": 1}, {"value": 2}]}
[{**child, "parent_name": record["name"]} for child in record["children"]]
Out[45]:
[{'value': 1, 'parent_name': 'root'}, {'value': 2, 'parent_name': 'root'}]
Example 43 — Conditionally explode arrays¶
jq
jq
if (.items?|type)=="array" then .items[] | . + {parent:.name} else . end
In [46]:
Copied!
rows = [
{"name": "alpha", "items": [{"id": 1}, {"id": 2}]},
{"name": "beta", "items": None},
]
result = []
for row in rows:
items = row.get("items")
if isinstance(items, list):
result.extend({**item, "parent": row["name"]} for item in items)
else:
result.append(row)
result
rows = [
{"name": "alpha", "items": [{"id": 1}, {"id": 2}]},
{"name": "beta", "items": None},
]
result = []
for row in rows:
items = row.get("items")
if isinstance(items, list):
result.extend({**item, "parent": row["name"]} for item in items)
else:
result.append(row)
result
Out[46]:
[{'id': 1, 'parent': 'alpha'},
{'id': 2, 'parent': 'alpha'},
{'name': 'beta', 'items': None}]