Berri.ai Hackathon
1. Try to auto-link zettelkasten notes
1.1. Create chunks of data to use as the data source for the Berri endpoint
import os

text_files = []
for root, dirs, files in os.walk("/home/chahak/Documents/chahak13.github.io/org/blog/"):
    for file in files:
        if file.endswith(".txt"):
            # os.walk yields bare filenames, so join with root to get the full path
            with open(os.path.join(root, file), 'r') as f:
                print(file)
                text_files.append({"chunk_metadata": file, "chunk": f.read()})
export_source_code_of_tex_file.txt
autoreload_with_ipython.txt
change_font_size_in_emacs.txt
attractors_using_datashader.txt
config_class_using_chainmaps.txt
chunky_pandas_read_csv_in_chunks.txt
attractors_examples_0_1_0_documentation.txt
audio_on_linux.txt
bug_polar_plot_wrong_y_values_when_set_ylim_is_used_issue_24790_matplotlib_matplotlib.txt
where_are_matplotlib_rcparams_used.txt
does_plt_scatter_work_with_masked_offsets.txt
chainmaps_in_python.txt
emacsconf_2022_talks_emacs_journalism_or_everything_s_a_nail_if_you_hit_it_with_emacs.txt
1_how_scientists_colorize_photos_of_space_youtube.txt
bug_contour_raises_indexerror_if_z_is_specified_as_keyword_argument_issue_24743_matplotlib_matplotlib.txt
click_command_line_interfaces:_make_options_required_if_other_optional_option_is_unset.txt
aaronpenne_generative_art_a_collection_of_my_generative_artwork_mostly_with_processing_in_python_mode.txt
12_decorator_and_descriptors_advance_python_tutorials_documentation.txt
1.2. Setting up the Berri.ai endpoint
import json
import requests

# Create a Berri app with the note chunks as the data source
url = "https://api.berri.ai/create_app"
data = {"user_email": "cpmdump@gmail.com", "data_source": json.dumps(text_files)}
instance_response = requests.post(url, data=data)
playground_endpoint = instance_response.json()["playground_endpoint"]
print(playground_endpoint)
play.berri.ai/aHR0cHM6Ly9zdG9yZXF1ZXJ5YWJoaTItYXlsdS56ZWV0LWJlcnJpLnplZXQuYXBwL2JlcnJpX3F1ZXJ5P3Byb2pfcGF0aD1pbmRleGVzL2NwbWR1bXBAZ21haWwuY29tL2E3MGZhZmYwLWJmMjAtNDU2NS1hMTgxLTRhMTg5ZTRjNDUwNSZwcm9qX25hbWU9U3RyYXdiZXJyeSBQcm9qZWN0JnF1ZXJ5PQ==
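As an aside, the path after play.berri.ai/ looks like a base64-encoded query URL, so decoding it is a quick way to see which instance a playground link points at. A small sketch (the prefix split and the padding handling are assumptions about the link format):

import base64

def decode_playground(playground_endpoint):
    """Decode the base64 path of a play.berri.ai link to reveal the underlying query URL."""
    encoded = playground_endpoint.split("play.berri.ai/", 1)[1]
    # Restore any stripped '=' padding before decoding
    encoded += "=" * (-len(encoded) % 4)
    return base64.b64decode(encoded).decode("utf-8")

# print(decode_playground(playground_endpoint))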
1.3. Playground tries
1.3.1. Try getting relevant notes with a simple query
from pprint import pprint

user_email = instance_response.json()["account_email"]
instance_id = instance_response.json()["instance_id"]
model = "gpt-3.5-turbo"
query_api = "https://api.berri.ai/query"
query = "which chunks are related to python?"
query_params = {
    "user_email": user_email,
    "instance_id": instance_id,
    "query": query,
    "model": model,
    "top_k": 5,
}
response = requests.get(query_api, params=query_params)
pprint(response.json())
1.3.2. Use create_template and create_app
We first create a template for the instance using the create_template API.
import requests
import json

template_api = "https://api.berri.ai/create_template"
prompt = """
Generate the response as a JSON object of related notes. For example,
Question: What notes are related to xyz?
Response: {
    "query": The user question,
    "related_notes": ["note1", "note2", "note3"],
}
"""
app_config = {
    "advanced": {
        "intent": "qa_doc",
        "search": "qa_gen",
        "app_type": "complex",
    },
    # "prompt": prompt,
}
data = {"app_config": json.dumps(app_config)}
template_response = requests.post(template_api, data=data)
print(template_response.json())
{'app_config': {'advanced': {'app_type': 'complex', 'intent': 'qa_doc', 'search': 'qa_gen'}}, 'template_id': 'b037c822-2035-4ce5-9f27-37388e6a0071'}
Now that we have a template, we will use it to create an instance using the create_app API.
template_id = template_response.json()["template_id"]
app_api = "https://api.berri.ai/create_app"
data = {
    "template_id": template_id,
    "user_email": "cpmdump@gmail.com",
    "data_source": json.dumps(text_files),
}
app_response = requests.post(app_api, data=data)
print(app_response.json())
app_endpoint = app_response.json()["api_endpoint"]
{'account_email': 'cpmdump@gmail.com', 'api_endpoint': 'https://api.berri.ai/query?user_email=cpmdump@gmail.com&instance_id=e4e9556e-7337-4bab-a48d-377954985374&agent_type=complex_support', 'instance_id': 'e4e9556e-7337-4bab-a48d-377954985374', 'playground_endpoint': 'play.berri.ai/aHR0cHM6Ly9zdG9yZXF1ZXJ5YWJoaTItYXlsdS56ZWV0LWJlcnJpLnplZXQuYXBwL2JlcnJpX3F1ZXJ5P3Byb2pfcGF0aD1pbmRleGVzL2NwbWR1bXBAZ21haWwuY29tL2U0ZTk1NTZlLTczMzctNGJhYi1hNDhkLTM3Nzk1NDk4NTM3NCZwcm9qX25hbWU9ZGF0YV9saXN0JmFnZW50X3R5cGU9Y29tcGxleF9zdXBwb3J0JnF1ZXJ5PQ==', 'website_endpoint': 'chat.berri.ai/aHR0cHM6Ly9zdG9yZXF1ZXJ5YWJoaTItYXlsdS56ZWV0LWJlcnJpLnplZXQuYXBwL2JlcnJpX3F1ZXJ5P3Byb2pfcGF0aD1pbmRleGVzL2NwbWR1bXBAZ21haWwuY29tL2U0ZTk1NTZlLTczMzctNGJhYi1hNDhkLTM3Nzk1NDk4NTM3NCZwcm9qX25hbWU9ZGF0YV9saXN0JmFnZW50X3R5cGU9Y29tcGxleF9zdXBwb3J0JnF1ZXJ5PQ=='}
This creates an instance that should also have summaries embedded, which might help relate notes.
from pprint import pprint

user_email = app_response.json()["account_email"]
instance_id = app_response.json()["instance_id"]
model = "gpt-3.5-turbo"
query_api = "https://api.berri.ai/query"
query = "which notes are related to python?"
query_params = {
    "user_email": user_email,
    "instance_id": instance_id,
    "query": query,
    "model": model,
    "top_k": 10,
}
response = requests.get(query_api, params=query_params)
for reference in response.json()["references"]:
    print(reference["doc_metadata"], reference["similarity"])
autoreload_with_ipython.txt 0.772821150576018
bug_polar_plot_wrong_y_values_when_set_ylim_is_used_issue_24790_matplotlib_matplotlib.txt 0.76837971064003
where_are_matplotlib_rcparams_used.txt 0.767879123880019
response.json()["response"]
All of the notes in this context are related to Python.
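The free-text answer is too generic to drive auto-linking, so the structured references with their similarity scores look like the more useful output. A minimal sketch of turning a query response into link candidates (the 0.75 cut-off is an arbitrary assumption, not something the API prescribes):

SIMILARITY_THRESHOLD = 0.75  # assumed cut-off; tune against real results

def candidate_links(response_json, threshold=SIMILARITY_THRESHOLD):
    """Return (note, similarity) pairs from a Berri query response above the threshold."""
    return [
        (ref["doc_metadata"], ref["similarity"])
        for ref in response_json.get("references", [])
        if ref["similarity"] >= threshold
    ]

# Usage with the response above:
# pprint(candidate_links(response.json()))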
1.3.3. Generate summaries of notes individually
import json
import requests

summaries = {}
for note in text_files:
    print(f"Summarizing {note['chunk_metadata']}")
    # Create a single-note instance and ask it to summarize that note
    url = "https://api.berri.ai/create_app"
    data = {"user_email": "cpmdump@gmail.com", "data_source": json.dumps([note])}
    app_response = requests.post(url, data=data)
    if not app_response.ok:
        print(app_response.text)
        break
    query_api = "https://api.berri.ai/query"
    query_params = {
        "user_email": app_response.json()["account_email"],
        "instance_id": app_response.json()["instance_id"],
        "query": f"Summarize the note {note['chunk_metadata']}",
        "model": "gpt-3.5-turbo",
    }
    response = requests.get(query_api, params=query_params)
    summaries[note["chunk_metadata"]] = response.json()["response"]
Summarizing export_source_code_of_tex_file.txt
Summarizing autoreload_with_ipython.txt
Summarizing change_font_size_in_emacs.txt
Summarizing attractors_using_datashader.txt
Summarizing config_class_using_chainmaps.txt
Summarizing chunky_pandas_read_csv_in_chunks.txt
Summarizing attractors_examples_0_1_0_documentation.txt
Summarizing audio_on_linux.txt
Summarizing bug_polar_plot_wrong_y_values_when_set_ylim_is_used_issue_24790_matplotlib_matplotlib.txt
Summarizing where_are_matplotlib_rcparams_used.txt
Summarizing does_plt_scatter_work_with_masked_offsets.txt
Summarizing chainmaps_in_python.txt
Summarizing emacsconf_2022_talks_emacs_journalism_or_everything_s_a_nail_if_you_hit_it_with_emacs.txt
Summarizing 1_how_scientists_colorize_photos_of_space_youtube.txt
Summarizing bug_contour_raises_indexerror_if_z_is_specified_as_keyword_argument_issue_24743_matplotlib_matplotlib.txt
Summarizing click_command_line_interfaces:_make_options_required_if_other_optional_option_is_unset.txt
Summarizing aaronpenne_generative_art_a_collection_of_my_generative_artwork_mostly_with_processing_in_python_mode.txt
Summarizing 12_decorator_and_descriptors_advance_python_tutorials_documentation.txt
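Each summary above comes from its own throwaway instance, so instances pile up quickly (section 1.4 deals with cleanup). A sketched alternative would be to delete each per-note instance right after its summary is read, using the same delete_instance endpoint as in section 1.4:

def delete_instance(user_email, instance_id):
    """Delete a Berri instance (same call as used in section 1.4)."""
    params = {"user_email": user_email, "instance_id": instance_id}
    return requests.post("https://api.berri.ai/delete_instance", params=params)

# Inside the summarization loop, right after storing the summary:
# delete_instance(app_response.json()["account_email"], app_response.json()["instance_id"])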
What if we use these summaries as the chunks instead, to find related notes?
import json
import requests

data_dump = json.dumps(
    [{"chunk_metadata": name, "chunk": summary} for name, summary in summaries.items()]
)
url = "https://api.berri.ai/create_app"
data = {"user_email": "cpmdump@gmail.com", "data_source": data_dump}
instance_response = requests.post(url, data=data)
playground_endpoint = instance_response.json()["playground_endpoint"]
print(playground_endpoint)
from tqdm import tqdm

related_queries = {}
for querynote in tqdm(summaries):
    query_api = "https://api.berri.ai/query"
    # querynote = "bug_polar_plot_wrong_y_values_when_set_ylim_is_used_issue_24790_matplotlib_matplotlib.txt"
    # querynote = "bug_contour_raises_indexerror_if_z_is_specified_as_keyword_argument_issue_24743_matplotlib_matplotlib.txt"
    query_params = {
        "user_email": instance_response.json()["account_email"],
        "instance_id": instance_response.json()["instance_id"],
        "query": f"Which note other than {querynote} talks about similar topics as {querynote}",
        # "query": f"What topics keywords are covered in {querynote}",
        "model": "gpt-3.5-turbo",
    }
    response = requests.get(query_api, params=query_params)
    related_queries[querynote] = [
        (x["doc_metadata"], x["similarity"]) for x in response.json()["references"]
    ]
100% 18/18 [00:33<00:00, 1.86s/it]
pprint(related_queries)
{'12_decorator_and_descriptors_advance_python_tutorials_documentation.txt': [('12_decorator_and_descriptors_advance_python_tutorials_documentation.txt',
                                                                              0.801491485346171),
                                                                             ('bug_contour_raises_indexerror_if_z_is_specified_as_keyword_argument_issue_24743_matplotlib_matplotlib.txt',
                                                                              0.769774133145604)],
 '1_how_scientists_colorize_photos_of_space_youtube.txt': [('1_how_scientists_colorize_photos_of_space_youtube.txt',
                                                            0.866898380621366),
                                                           ('aaronpenne_generative_art_a_collection_of_my_generative_artwork_mostly_with_processing_in_python_mode.txt',
                                                            0.787544429009431)],
 'aaronpenne_generative_art_a_collection_of_my_generative_artwork_mostly_with_processing_in_python_mode.txt': [('aaronpenne_generative_art_a_collection_of_my_generative_artwork_mostly_with_processing_in_python_mode.txt',
                                                                                                                0.88111354230034),
                                                                                                               ('emacsconf_2022_talks_emacs_journalism_or_everything_s_a_nail_if_you_hit_it_with_emacs.txt',
                                                                                                                0.780368471118458)],
 'attractors_examples_0_1_0_documentation.txt': [('attractors_examples_0_1_0_documentation.txt',
                                                  0.849884469214183),
                                                 ('attractors_using_datashader.txt',
                                                  0.799727094675026)],
 'attractors_using_datashader.txt': [('attractors_using_datashader.txt',
                                      0.849356923206236),
                                     ('attractors_examples_0_1_0_documentation.txt',
                                      0.841316289272838)],
 'audio_on_linux.txt': [('audio_on_linux.txt', 0.793566176368703),
                        ('click_command_line_interfaces:_make_options_required_if_other_optional_option_is_unset.txt',
                         0.760427890909493)],
 'autoreload_with_ipython.txt': [('autoreload_with_ipython.txt',
                                  0.862660274256445),
                                 ('chunky_pandas_read_csv_in_chunks.txt',
                                  0.773916791555096)],
 'bug_contour_raises_indexerror_if_z_is_specified_as_keyword_argument_issue_24743_matplotlib_matplotlib.txt': [('bug_contour_raises_indexerror_if_z_is_specified_as_keyword_argument_issue_24743_matplotlib_matplotlib.txt',
                                                                                                                0.927062315641177),
                                                                                                               ('bug_polar_plot_wrong_y_values_when_set_ylim_is_used_issue_24790_matplotlib_matplotlib.txt',
                                                                                                                0.818052487673946)],
 'bug_polar_plot_wrong_y_values_when_set_ylim_is_used_issue_24790_matplotlib_matplotlib.txt': [('bug_polar_plot_wrong_y_values_when_set_ylim_is_used_issue_24790_matplotlib_matplotlib.txt',
                                                                                                0.88649582204242),
                                                                                               ('bug_contour_raises_indexerror_if_z_is_specified_as_keyword_argument_issue_24743_matplotlib_matplotlib.txt',
                                                                                                0.850670934887906)],
 'chainmaps_in_python.txt': [('chainmaps_in_python.txt', 0.838342616347961),
                             ('config_class_using_chainmaps.txt',
                              0.807442005228644)],
 'change_font_size_in_emacs.txt': [('change_font_size_in_emacs.txt',
                                    0.851839845742051),
                                   ('emacsconf_2022_talks_emacs_journalism_or_everything_s_a_nail_if_you_hit_it_with_emacs.txt',
                                    0.789396265311118)],
 'chunky_pandas_read_csv_in_chunks.txt': [('chunky_pandas_read_csv_in_chunks.txt',
                                           0.845295000355032),
                                          ('where_are_matplotlib_rcparams_used.txt',
                                           0.763790937761751)],
 'click_command_line_interfaces:_make_options_required_if_other_optional_option_is_unset.txt': [('click_command_line_interfaces:_make_options_required_if_other_optional_option_is_unset.txt',
                                                                                                 0.923219390051759),
                                                                                                ('where_are_matplotlib_rcparams_used.txt',
                                                                                                 0.760569829934478)],
 'config_class_using_chainmaps.txt': [('config_class_using_chainmaps.txt',
                                       0.839832113622164),
                                      ('chainmaps_in_python.txt',
                                       0.783611604698386)],
 'does_plt_scatter_work_with_masked_offsets.txt': [('does_plt_scatter_work_with_masked_offsets.txt',
                                                    0.845049315280412),
                                                   ('where_are_matplotlib_rcparams_used.txt',
                                                    0.773232239436078)],
 'emacsconf_2022_talks_emacs_journalism_or_everything_s_a_nail_if_you_hit_it_with_emacs.txt': [('emacsconf_2022_talks_emacs_journalism_or_everything_s_a_nail_if_you_hit_it_with_emacs.txt',
                                                                                                0.839243988602219),
                                                                                               ('change_font_size_in_emacs.txt',
                                                                                                0.794385513400397)],
 'export_source_code_of_tex_file.txt': [('export_source_code_of_tex_file.txt',
                                         0.857983976417399),
                                        ('where_are_matplotlib_rcparams_used.txt',
                                         0.780384919535461)],
 'where_are_matplotlib_rcparams_used.txt': [('where_are_matplotlib_rcparams_used.txt',
                                             0.852354787774178),
                                            ('bug_contour_raises_indexerror_if_z_is_specified_as_keyword_argument_issue_24743_matplotlib_matplotlib.txt',
                                             0.791104194745004)]}
os.path.splitext("/abc/base.txt")
| /abc/base | .txt | 
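Each note's top match in related_queries is the note itself, so turning these results into actual zettelkasten links means dropping the self-match and stripping the .txt extension (which is what the splitext call above is for). A rough sketch of that post-processing, assuming an arbitrary similarity cut-off and that appending a plain "Related:" line to each note file is an acceptable link format:

import os

def build_related_links(related_queries, threshold=0.75):
    """Map each note to the other notes it should link to, with self-matches removed."""
    links = {}
    for note, refs in related_queries.items():
        links[note] = [
            os.path.splitext(name)[0]  # drop the .txt extension
            for name, similarity in refs
            if name != note and similarity >= threshold
        ]
    return links

# Hypothetical usage: append the suggestions to each note in the blog directory.
# blog_dir = "/home/chahak/Documents/chahak13.github.io/org/blog/"
# for note, targets in build_related_links(related_queries).items():
#     if targets:
#         with open(os.path.join(blog_dir, note), "a") as f:
#             f.write("\nRelated: " + ", ".join(targets) + "\n")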
1.4. Deleting Berri instances
import requests

url = "https://api.berri.ai/delete_instance"
instances = [
    "a235f790-8609-4b75-a662-50fc7dd8bfa0",
    "a93e9e5c-6aef-4965-877d-ad8ef02d072c",
    "51d56bb7-00ae-42e5-b452-1d14bfb6310c",
    "10fc52c4-ed2e-4c4a-9184-57f9e4c21f03",
    "ac160ec7-7660-472d-bdb3-b35b3742449d",
    "9f5d7028-3644-4874-8dda-eae2cbdb2bb5",
    "1a387ee8-aade-4cc5-b556-af6405660329",
    "cd991ec9-92d5-4054-8afc-fc440e3519cf",
    "1cae75f4-809e-4073-8cb6-516f7ad27748",
    "a70faff0-bf20-4565-a181-4a189e4c4505",
]
for instance_id in instances:
    params = {
        "user_email": "cpmdump@gmail.com",
        "instance_id": instance_id,
    }
    print(f"Deleting {instance_id}")
    response = requests.post(url, params=params)
    print(response.text)
Deleting a235f790-8609-4b75-a662-50fc7dd8bfa0
{"message":"Instance a235f790-8609-4b75-a662-50fc7dd8bfa0 deleted successfully","status":"success"}
Deleting a93e9e5c-6aef-4965-877d-ad8ef02d072c
{"message":"Instance a93e9e5c-6aef-4965-877d-ad8ef02d072c deleted successfully","status":"success"}
Deleting 51d56bb7-00ae-42e5-b452-1d14bfb6310c
{"message":"Instance 51d56bb7-00ae-42e5-b452-1d14bfb6310c deleted successfully","status":"success"}
Deleting 10fc52c4-ed2e-4c4a-9184-57f9e4c21f03
{"message":"Instance 10fc52c4-ed2e-4c4a-9184-57f9e4c21f03 deleted successfully","status":"success"}
Deleting ac160ec7-7660-472d-bdb3-b35b3742449d
{"message":"Instance ac160ec7-7660-472d-bdb3-b35b3742449d deleted successfully","status":"success"}
Deleting 9f5d7028-3644-4874-8dda-eae2cbdb2bb5
{"message":"Instance 9f5d7028-3644-4874-8dda-eae2cbdb2bb5 deleted successfully","status":"success"}
Deleting 1a387ee8-aade-4cc5-b556-af6405660329
{"message":"Instance 1a387ee8-aade-4cc5-b556-af6405660329 deleted successfully","status":"success"}
Deleting cd991ec9-92d5-4054-8afc-fc440e3519cf
{"message":"Instance cd991ec9-92d5-4054-8afc-fc440e3519cf deleted successfully","status":"success"}
Deleting 1cae75f4-809e-4073-8cb6-516f7ad27748
{"message":"Instance 1cae75f4-809e-4073-8cb6-516f7ad27748 deleted successfully","status":"success"}
Deleting a70faff0-bf20-4565-a181-4a189e4c4505
{"message":"Instance a70faff0-bf20-4565-a181-4a189e4c4505 deleted successfully","status":"success"}