# Read the CSV file, properly handling multiline fields
dataset_id = project_id + '.' + raw_dataset_id
client = bigquery.Client(credentials=credentials, project=project_id)
csv_file_path = "../embedded_data.csv"
df = pd.read_csv(csv_file_path, engine='python', quotechar='"', quoting=1)  # quoting=1 corresponds to csv.QUOTE_ALL
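# Optional: inspect the first few rows to confirm the multiline fields parsed correctly
print(df.head())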
# Preprocess the data to ensure content_vector is correctly formatted
# strip the surrounding brackets, split on commas, and convert each element to float
def preprocess_content_vector(row):
    row['content_vector'] = [float(x) for x in row['content_vector'][1:-1].split(',')]
    return row
# Apply preprocessing to the dataframe
df = df.apply(preprocess_content_vector, axis=1)
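# Sanity check (assumes the CSV contains at least one row): content_vector should
# now be a list of floats rather than its original string representation
sample_vector = df['content_vector'].iloc[0]
assert isinstance(sample_vector, list) and all(isinstance(x, float) for x in sample_vector)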
# Define the schema of the final table
final_schema = [
    bigquery.SchemaField("id", "STRING"),
    bigquery.SchemaField("vector_id", "STRING"),
    bigquery.SchemaField("title", "STRING"),
    bigquery.SchemaField("text", "STRING"),
    bigquery.SchemaField("title_vector", "STRING"),
    bigquery.SchemaField("content_vector", "FLOAT64", mode="REPEATED"),
    bigquery.SchemaField("category", "STRING"),
]
# Define the final table ID
raw_table_id = 'embedded_data'
final_table_id = f'{dataset_id}.{raw_table_id}'
# Create the final table object
final_table = bigquery.Table(final_table_id, schema=final_schema)
# Send the table to the API for creation
final_table = client.create_table(final_table, exists_ok=True) # API request
print(f"Created final table {project_id}.{final_table.dataset_id}.{final_table.table_id}")
# Convert DataFrame to list of dictionaries for BigQuery insertion
rows_to_insert = df.to_dict(orient='records')
# Upload data to the final table
errors = client.insert_rows_json(f"{final_table.dataset_id}.{final_table.table_id}", rows_to_insert) # API request
if errors:
    print(f"Encountered errors while inserting rows: {errors}")
else:
    print(f"Successfully loaded data into {final_table_id}")