# We'll need to install the Weaviate client (run in a notebook cell or shell):
#   !pip install weaviate-client
# Install wget to pull the zip file:
#   !pip install wget
# Standard library
import os
import warnings
from ast import literal_eval
from typing import Iterator, List

# Third-party
import numpy as np
import openai
import pandas as pd
import wget

# Weaviate's client library for Python
import weaviate

# I've set this to our new embeddings model, this can be changed to the embedding model of your choice
EMBEDDING_MODEL = "text-embedding-3-small"

# Ignore unclosed SSL socket warnings - optional in case you get these errors
warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
# URL of the zipped dataset that is downloaded below.
embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'

# The file is ~700 MB so this will take some time
wget.download(embeddings_url)
import zipfile

# Unpack the downloaded archive (expected in the working directory) into ../data.
# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile("vector_database_wikipedia_articles_embedded.zip", "r") as zip_ref:
    zip_ref.extractall("../data")
# NOTE(review): `article_df` is not created anywhere in the visible code; it is
# presumably a DataFrame loaded from the extracted data in a step not shown
# here - confirm before running this cell.

# Read vectors from strings back into a list
article_df['title_vector'] = article_df.title_vector.apply(literal_eval)
article_df['content_vector'] = article_df.content_vector.apply(literal_eval)

# Set vector_id to be a string
article_df['vector_id'] = article_df['vector_id'].apply(str)
# NOTE(review): `client` is not created anywhere in the visible code; it is
# presumably a connected Weaviate client set up in a step not shown - confirm.

# Clear up the schema, so that we can recreate it
client.schema.delete_all()
client.schema.get()

# Define the Schema object for the "Article" class using the text2vec-openai
# vectorizer. `title` is left to the vectorizer; `content` is explicitly
# skipped via its property-level moduleConfig.
# NOTE(review): the original comment here named `text-embedding-3-small`, but
# the moduleConfig below selects model "ada" / modelVersion "002" - confirm
# which model is intended before changing either.
article_schema = {
    "class": "Article",
    "description": "A collection of articles",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
        },
    },
    "properties": [
        {
            "name": "title",
            "description": "Title of the article",
            "dataType": ["string"],
        },
        {
            "name": "content",
            "description": "Contents of the article",
            "dataType": ["text"],
            "moduleConfig": {"text2vec-openai": {"skip": True}},
        },
    ],
}

# add the Article schema
client.schema.create_class(article_schema)

# get the schema to make sure it worked
client.schema.get()
# Test that all data has loaded – get object count
result = (
    client.query.aggregate("Article")
    .with_fields("meta { count }")
    .do()
)
print("Object count: ", result["data"]["Aggregate"]["Article"])
Object count: [{'meta': {'count': 25000}}]
# Test one article has worked by checking one object
test_article = (
    client.query
    .get("Article", ["title", "content", "_additional {id}"])
    .with_limit(1)
    .do()
)["data"]["Get"]["Article"][0]

# Show the object's id followed by its stored title and content
print(test_article["_additional"]["id"])
print(test_article["title"])
print(test_article["content"])
000393f2-1182-4e3d-abcf-4217eda64be0
Lago d'Origlio
Lago d'Origlio is a lake in the municipality of Origlio, in Ticino, Switzerland.
Lakes of Ticino
def query_weaviate(query, collection_name, top_k=20):
    """Embed *query* with OpenAI and run a near-vector search in Weaviate.

    Args:
        query: Text to embed and search for.
        collection_name: Name of the Weaviate class to query (e.g. "Article").
        top_k: Maximum number of results to request. Defaults to 20.

    Returns:
        The raw response of the Weaviate query's ``.do()`` call; callers in
        this file read hits from ``["data"]["Get"][collection_name]``.
    """
    # Creates embedding vector from user query
    # NOTE(review): `openai.Embedding.create` is the pre-1.0 openai SDK
    # interface - confirm the installed openai version supports it.
    embedded_query = openai.Embedding.create(
        input=query,
        model=EMBEDDING_MODEL,
    )["data"][0]['embedding']

    near_vector = {"vector": embedded_query}

    # Queries input schema with vectorised user query
    query_result = (
        client.query
        .get(collection_name, ["title", "content", "_additional {certainty distance}"])
        .with_near_vector(near_vector)
        .with_limit(top_k)
        .do()
    )

    return query_result
query_result = query_weaviate("modern art in Europe", "Article")

# Print a 1-based ranking of the hits with certainty and distance rounded to 3 places
for counter, article in enumerate(query_result["data"]["Get"]["Article"], start=1):
    print(f"{counter}. {article['title']} (Certainty: {round(article['_additional']['certainty'], 3)}) (Distance: {round(article['_additional']['distance'], 3)})")
1. Museum of Modern Art (Certainty: 0.938) (Distance: 0.125)
2. Western Europe (Certainty: 0.934) (Distance: 0.133)
3. Renaissance art (Certainty: 0.932) (Distance: 0.136)
4. Pop art (Certainty: 0.93) (Distance: 0.14)
5. Northern Europe (Certainty: 0.927) (Distance: 0.145)
6. Hellenistic art (Certainty: 0.926) (Distance: 0.147)
7. Modernist literature (Certainty: 0.924) (Distance: 0.153)
8. Art film (Certainty: 0.922) (Distance: 0.157)
9. Central Europe (Certainty: 0.921) (Distance: 0.157)
10. European (Certainty: 0.921) (Distance: 0.159)
11. Art (Certainty: 0.921) (Distance: 0.159)
12. Byzantine art (Certainty: 0.92) (Distance: 0.159)
13. Postmodernism (Certainty: 0.92) (Distance: 0.16)
14. Eastern Europe (Certainty: 0.92) (Distance: 0.161)
15. Europe (Certainty: 0.919) (Distance: 0.161)
16. Cubism (Certainty: 0.919) (Distance: 0.161)
17. Impressionism (Certainty: 0.919) (Distance: 0.162)
18. Bauhaus (Certainty: 0.919) (Distance: 0.162)
19. Expressionism (Certainty: 0.918) (Distance: 0.163)
20. Surrealism (Certainty: 0.918) (Distance: 0.163)
query_result = query_weaviate("Famous battles in Scottish history", "Article")

# Print a 1-based ranking of the hits with certainty (shown as "Score") rounded to 3 places
for counter, article in enumerate(query_result["data"]["Get"]["Article"], start=1):
    print(f"{counter}. {article['title']} (Score: {round(article['_additional']['certainty'], 3)})")
1. Historic Scotland (Score: 0.946)
2. First War of Scottish Independence (Score: 0.946)
3. Battle of Bannockburn (Score: 0.946)
4. Wars of Scottish Independence (Score: 0.944)
5. Second War of Scottish Independence (Score: 0.94)
6. List of Scottish monarchs (Score: 0.937)
7. Scottish Borders (Score: 0.932)
8. Braveheart (Score: 0.929)
9. John of Scotland (Score: 0.929)
10. Guardians of Scotland (Score: 0.926)
11. Holyrood Abbey (Score: 0.925)
12. Scottish (Score: 0.925)
13. Scots (Score: 0.925)
14. Robert I of Scotland (Score: 0.924)
15. Scottish people (Score: 0.924)
16. Edinburgh Castle (Score: 0.924)
17. Alexander I of Scotland (Score: 0.924)
18. Robert Burns (Score: 0.924)
19. Battle of Bosworth Field (Score: 0.922)
20. David II of Scotland (Score: 0.922)