Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: to_gbq uses default_type for ambiguous array types and struct field types #838

Merged
merged 20 commits into main from issue836-to_gbq-with-schema
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
5c5a04b
fix: `to_gbq` uses `default_type` for ambiguous array types and struct field types
tswast Dec 12, 2024
30c2d0c
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Dec 12, 2024
fa6907b
fix arrow list(null) case too
tswast Dec 12, 2024
d9b0a10
Merge remote-tracking branch 'origin/issue836-to_gbq-with-schema' int…
tswast Dec 12, 2024
b5ccce2
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Dec 12, 2024
77d8e9f
lint
tswast Dec 12, 2024
c289eb4
Merge remote-tracking branch 'origin/main' into issue836-to_gbq-with-schema
tswast Dec 12, 2024
58a0e54
Merge remote-tracking branch 'origin/issue836-to_gbq-with-schema' int…
tswast Dec 12, 2024
ae17ea4
Update pandas_gbq/schema/pandas_to_bigquery.py
tswast Dec 16, 2024
45f1df2
Update pandas_gbq/schema/pandas_to_bigquery.py
tswast Dec 16, 2024
286fefd
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Dec 16, 2024
619ec27
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Dec 16, 2024
e1d486c
Merge branch 'issue836-to_gbq-with-schema' of https://github.com/goog…
gcf-owl-bot[bot] Dec 16, 2024
d79e20d
Merge remote-tracking branch 'origin/main' into issue836-to_gbq-with-schema
tswast Dec 18, 2024
1e48053
remove redundant string check
tswast Dec 19, 2024
854e9c4
Apply suggestions from code review
tswast Dec 19, 2024
c20449b
Merge remote-tracking branch 'origin/issue836-to_gbq-with-schema' int…
tswast Dec 19, 2024
82ac6db
add docstrings and a few more test cases
tswast Dec 19, 2024
a2dc91e
use python 3.10 for docs github action
tswast Dec 19, 2024
7e23e74
exclude docs from owlbot
tswast Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 22 additions & 9 deletions pandas_gbq/schema/pandas_to_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def dataframe_to_bigquery_fields(

# Try to automatically determine the type based on a few rows of the data.
values = dataframe.reset_index()[column]
bq_field = values_to_bigquery_field(column, values)
bq_field = values_to_bigquery_field(column, values, default_type=default_type)

if bq_field:
bq_schema_out.append(bq_field)
Expand All @@ -114,7 +114,9 @@ def dataframe_to_bigquery_fields(
arrow_value = pyarrow.array(values)
bq_field = (
pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field(
column, arrow_value.type
column,
arrow_value.type,
default_type=default_type,
)
)

Expand Down Expand Up @@ -164,7 +166,14 @@ def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]:
return None


def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
def value_to_bigquery_field(
name, value, default_type=None
) -> Optional[schema.SchemaField]:
tswast marked this conversation as resolved.
Show resolved Hide resolved
# Set the SchemaField datatype to the given default_type if the value
tswast marked this conversation as resolved.
Show resolved Hide resolved
# being assessed is None.
if value is None:
return schema.SchemaField(name, default_type)

if isinstance(value, str):
return schema.SchemaField(name, "STRING")

Expand All @@ -188,29 +197,33 @@ def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]:
return None


def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]:
def values_to_bigquery_field(
name, values, default_type="STRING"
) -> Optional[schema.SchemaField]:
tswast marked this conversation as resolved.
Show resolved Hide resolved
value = pandas_gbq.core.pandas.first_valid(values)

# All NULL, type not determinable.
# All values came back as NULL, thus type not determinable by this method.
tswast marked this conversation as resolved.
Show resolved Hide resolved
# Return None so we can try other methods.
if value is None:
return None

field = value_to_bigquery_field(name, value)
field = value_to_bigquery_field(name, value, default_type=default_type)
if field is not None:
return field

if isinstance(value, str):
return schema.SchemaField(name, "STRING")

# Check plain ARRAY values here. Let STRUCT get determined by pyarrow,
# which can examine more values to determine all keys.
# Check plain ARRAY values here. Exclude mapping types to let STRUCT get
# determined by pyarrow, which can examine more values to determine all
# keys.
if isinstance(value, collections.abc.Iterable) and not isinstance(
value, collections.abc.Mapping
):
# It could be that this value contains all None or is empty, so get the
# first non-None value we can find.
valid_item = pandas_gbq.core.pandas.first_array_valid(values)
field = value_to_bigquery_field(name, valid_item)
field = value_to_bigquery_field(name, valid_item, default_type=default_type)

if field is not None:
return schema.SchemaField(name, field.field_type, mode="REPEATED")
Expand Down
26 changes: 21 additions & 5 deletions pandas_gbq/schema/pyarrow_to_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,15 @@
}


def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
def arrow_type_to_bigquery_field(
tswast marked this conversation as resolved.
Show resolved Hide resolved
name, type_, default_type="STRING"
) -> Optional[schema.SchemaField]:
# If a sub-field is the null type, then assume it's the default type, as
# that's the best we can do.
# https://github.com/googleapis/python-bigquery-pandas/issues/836
if pyarrow.types.is_null(type_):
return schema.SchemaField(name, default_type)

# Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use
# a special case to disambiguate them. See:
# https://github.com/googleapis/python-bigquery-pandas/issues/450
Expand All @@ -52,22 +60,30 @@ def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
return schema.SchemaField(name, detected_type)

if pyarrow.types.is_list(type_):
return arrow_list_type_to_bigquery(name, type_)
return arrow_list_type_to_bigquery(name, type_, default_type=default_type)

if pyarrow.types.is_struct(type_):
inner_fields: list[pyarrow.Field] = []
struct_type = cast(pyarrow.StructType, type_)
for field_index in range(struct_type.num_fields):
field = struct_type[field_index]
inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type))
inner_fields.append(
arrow_type_to_bigquery_field(
field.name, field.type, default_type=default_type
)
)

return schema.SchemaField(name, "RECORD", fields=inner_fields)

return None


def arrow_list_type_to_bigquery(name, type_) -> Optional[schema.SchemaField]:
inner_field = arrow_type_to_bigquery_field(name, type_.value_type)
def arrow_list_type_to_bigquery(
name, type_, default_type="STRING"
) -> Optional[schema.SchemaField]:
inner_field = arrow_type_to_bigquery_field(
name, type_.value_type, default_type=default_type
)
if inner_field is None:
return None

Expand Down
18 changes: 8 additions & 10 deletions tests/unit/schema/test_pyarrow_to_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,14 @@ def test_arrow_type_to_bigquery_field_scalar_types(pyarrow_type, bigquery_type):


def test_arrow_type_to_bigquery_field_unknown():
assert (
pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow.null())
is None
)
assert pyarrow_to_bigquery.arrow_type_to_bigquery_field(
"test_name", pyarrow.null(), default_type="DEFAULT_TYPE"
) == bigquery.SchemaField("test_name", "DEFAULT_TYPE")


def test_arrow_type_to_bigquery_field_list_of_unknown():
assert (
pyarrow_to_bigquery.arrow_type_to_bigquery_field(
"test_name", pyarrow.list_(pyarrow.null())
)
is None
)
assert pyarrow_to_bigquery.arrow_type_to_bigquery_field(
"test_name",
pyarrow.list_(pyarrow.null()),
default_type="DEFAULT_TYPE",
) == bigquery.SchemaField("test_name", "DEFAULT_TYPE", mode="REPEATED")
51 changes: 46 additions & 5 deletions tests/unit/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_schema_is_subset_fails_if_not_subset():
[
pytest.param(
pandas.DataFrame(data={"col1": [object()]}),
{"fields": [{"name": "col1", "type": "STRING"}]},
{"fields": [{"name": "col1", "type": "DEFAULT_TYPE"}]},
id="default-type-fails-pyarrow-conversion",
),
(
Expand Down Expand Up @@ -182,13 +182,15 @@ def test_schema_is_subset_fails_if_not_subset():
else "object",
),
"list_of_struct": pandas.Series(
[[], [{"test": "abc"}], []],
[[], [{"test": 123.0}], []],
dtype=pandas.ArrowDtype(
pyarrow.list_(pyarrow.struct([("test", pyarrow.string())]))
pyarrow.list_(pyarrow.struct([("test", pyarrow.float64())]))
)
if hasattr(pandas, "ArrowDtype")
else "object",
),
"list_of_unknown": [[], [], []],
"list_of_null": [[None, None], [None], [None, None]],
}
),
{
Expand All @@ -200,17 +202,56 @@ def test_schema_is_subset_fails_if_not_subset():
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{"name": "test", "type": "STRING", "mode": "NULLABLE"},
{"name": "test", "type": "FLOAT", "mode": "NULLABLE"},
],
},
# Use DEFAULT_TYPE because there are no values to detect a type.
{
"name": "list_of_unknown",
"type": "DEFAULT_TYPE",
"mode": "REPEATED",
},
{
"name": "list_of_null",
"type": "DEFAULT_TYPE",
"mode": "REPEATED",
},
],
},
id="array",
),
pytest.param(
# If a struct contains only nulls in a sub-field, use the default
# type for subfields without a type we can determine.
# https://github.com/googleapis/python-bigquery-pandas/issues/836
pandas.DataFrame(
{
"id": [0, 1],
"positions": [{"state": None}, {"state": None}],
},
),
{
"fields": [
{"name": "id", "type": "INTEGER"},
{
"name": "positions",
"type": "RECORD",
"fields": [
{
"name": "state",
"type": "DEFAULT_TYPE",
"mode": "NULLABLE",
},
],
},
],
},
id="issue832-null-struct-field",
),
],
)
def test_generate_bq_schema(dataframe, expected_schema):
schema = pandas_gbq.gbq._generate_bq_schema(dataframe)
schema = pandas_gbq.gbq._generate_bq_schema(dataframe, default_type="DEFAULT_TYPE")

# NULLABLE is the default mode.
for field in expected_schema["fields"]:
Expand Down
Loading