commit 79eac02dbfa4191688f6d7730bd5cb2eb3a582ed Author: Job Becht Date: Thu Apr 6 10:51:24 2023 +0200 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f7275bb --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +venv/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..ec17583 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +# Run examples with Visual Studio Code +## Install Code and Python +* Install Visual Studio Code: https://code.visualstudio.com/download +* Install latest Python: https://www.python.org/downloads/ +## Run script +* Open folder containing scripts +* Confirm that you trust the author so the scripts can be run +* Install Python extension if Code asks +* Create virtual environment (Ctrl+Shift+P Python: Create Environment) +* Open terminal to install dependencies (Ctrl+Shift+P Terminal: Create New Terminal) + * Windows: if an error about Execution Policies occurs, change default terminal to Command Prompt instead of PowerShell: File -> Preferences -> Settings -> Features -> Terminal -> Integrated -> Default Profile: Windows -> Select `Command Prompt` + * In terminal: `pip install -r requirements-{operating_system}.txt` + * Windows + Python 3.11: If `wordcloud` cannot be installed, try installing wheel from here https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud (`pip install wordcloud-1.8.1-cp311-cp311-win_amd64.whl`) + * Linux/MacOS - ignore pywin32 dependency - remove from requirements.txt +* Replace `hansken_host` variable with the IP address of your Hansken host +* Verify if the scripts contain `Run Cell` options -> Download ipykernel package if prompted +* Select default Python installation to use as kernel + +* Linux: If the warning `:7: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.` pops up: + * update imports to explicitly use a GUI backend for matplotlib: `matplotlib.use('tkAgg')` + * install tkinter through your system package manager, e.g. `sudo apt-get install python3-tk` (tkinter is part of the Python standard library and cannot be installed with `pip install tkinter`) diff --git 
a/hansken_facet_heatmap.py b/hansken_facet_heatmap.py new file mode 100755 index 0000000..4764cc9 --- /dev/null +++ b/hansken_facet_heatmap.py @@ -0,0 +1,62 @@ +# %% [markdown] +# Plot chat message counts per hour as a heatmap + +## Initialize Hansken connection +import sys +import pandas as pd + +from types import SimpleNamespace +from matplotlib import pyplot +import seaborn as sns +from matplotlib.colors import LogNorm, Normalize + +from hansken.connect import connect_project +from hansken.query import RangeFacet +# %% [python] + +# setup Hansken project context + +# if you want a context in a Hansken with authentication +# +# context = connect_project( +# # endpoint='https://gatekeeper01.prod.hansken.holmes.nl/gatekeeper/', +# # # the keystore REST endpoint when this script was exported, note that +# # # this can be overridden with --keystore +# # keystore='https://keystore01.prod.hansken.holmes.nl/keystore/', +# # # the project id of your project +# # project='d42bd9c3-63db-474c-a36f-b87e1eb9e2d3', +# # interactive=True) + +# Hansken SDK running on localhost + +context = connect_project(endpoint='http://localhost:9091/gatekeeper/', + project='d42bd9c3-63db-474c-a36f-b87e1eb9e2d3', + keystore='http://localhost:9090/keystore/') + +# %% + +# Perform facet search in Hansken across dates and present results in a heatmap + +start = '2022-7-1T00:00Z' +end = '2022-7-31T23:59Z' +search_query = "type:chatMessage" + +# Group the matching traces by the dates property on a scale of an hour. 
A Facet on a date requires a min and max +facet = RangeFacet('dates', scale='hour', min=start, max=end) + +# Create a dataframe with entries per hour for the period indicated by start and end +df = pd.DataFrame() +df['Time'] = pd.date_range(start,end,freq='1H') +df['Count'] = 0 +df.set_index('Time',inplace=True) + +# Perform search using the facet +with context.search(search_query, facets=facet, count=0 ) as searchResult: + for _, result in searchResult.facets[0].items(): + df.loc[pd.to_datetime(result.value),'Count']=result.count + +# So that we can pivot and prepare a dataframe for our heatmap +df_map = pd.pivot_table( df, fill_value=0.0, columns=df.index.date, index=df.index.hour, aggfunc="sum")['Count'] + +sns.heatmap(df_map, cmap="Greens",norm=LogNorm()) + diff --git a/query_word_cloud.py b/query_word_cloud.py new file mode 100644 index 0000000..651d0cb --- /dev/null +++ b/query_word_cloud.py @@ -0,0 +1,41 @@ +# %% [python] +import sys +from wordcloud import WordCloud, STOPWORDS +from types import SimpleNamespace + +import matplotlib.pyplot as plt + +from hansken.connect import connect_project + +# setup hansken connection +in_browser = 'js' in sys.modules +hansken_host = '' +context = connect_project(endpoint=f'http://{hansken_host}:9091/gatekeeper/', + project='5ee273fd-0978-4a0a-b8b0-2af2f8479214', + keystore=f'http://{hansken_host}:9091/keystore/', + # Authentication is faked if we run in the browser, + # because an authenticated session should already be present + auth=SimpleNamespace() if in_browser else None, + interactive=True) + +# %% [markdown] +### Collect words +# The cell below searches for all `chatMessage` traces in the current project. The `chatMessage.message` property contains the actual message. All found messages are concatenated in a single long string. 
+# %% [python] +words = "" +with context.search("type:chatMessage") as searchResult: + for result in searchResult: + message = result.get("chatMessage.message") + if message is not None: + words += " " + message +words +# %% [markdown] +### Draw Wordcloud +# The cell below draws a wordcloud using the words occurring in the messages. `STOPWORDS` is used to ignore common english words. +# %% [python] +# draw word cloud +wc = WordCloud(stopwords=STOPWORDS, width=600, height=400).generate(words) +plt.figure(figsize=(20, 6)) +plt.imshow(wc, interpolation="bilinear") +plt.axis("off") +plt.show() diff --git a/requirements-linux.txt b/requirements-linux.txt new file mode 100644 index 0000000..941a207 --- /dev/null +++ b/requirements-linux.txt @@ -0,0 +1,59 @@ +asttokens==2.2.1 +attrs==22.2.0 +backcall==0.2.0 +certifi==2022.12.7 +charset-normalizer==3.1.0 +colorama==0.4.6 +comm==0.1.2 +contourpy==1.0.7 +cycler==0.11.0 +debugpy==1.6.6 +decorator==5.1.1 +executing==1.2.0 +fastjsonschema==2.16.3 +fonttools==4.39.2 +hansken==2023.3.6 +idna==3.4 +ijson==3.2.0.post0 +ipykernel==6.21.3 +ipython==8.11.0 +iso8601==1.1.0 +jedi==0.18.2 +jsonschema==4.17.3 +jupyter_client==8.0.3 +jupyter_core==5.3.0 +kiwisolver==1.4.4 +Logbook==1.5.3 +matplotlib==3.7.1 +matplotlib-inline==0.1.6 +more-itertools==9.1.0 +nbformat==5.7.3 +nest-asyncio==1.5.6 +numpy==1.24.2 +packaging==23.0 +pandas==1.5.3 +parso==0.8.3 +pickleshare==0.7.5 +Pillow==9.4.0 +platformdirs==3.1.1 +plotly==5.13.1 +prompt-toolkit==3.0.38 +psutil==5.9.4 +pure-eval==0.2.2 +Pygments==2.14.0 +pyparsing==3.0.9 +pyrsistent==0.19.3 +python-dateutil==2.8.2 +pytz==2022.7.1 +pyzmq==25.0.1 +requests==2.28.2 +six==1.16.0 +squarify==0.4.3 +stack-data==0.6.2 +tabulate==0.9.0 +tenacity==8.2.2 +tornado==6.2 +traitlets==5.9.0 +urllib3==1.26.15 +wcwidth==0.2.6 +wordcloud==1.8.1 diff --git a/requirements-macos.txt b/requirements-macos.txt new file mode 100644 index 0000000..d729083 --- /dev/null +++ b/requirements-macos.txt @@ -0,0 +1,60 @@ 
+asttokens==2.2.1 +attrs==22.2.0 +backcall==0.2.0 +certifi==2022.12.7 +charset-normalizer==3.1.0 +colorama==0.4.6 +comm==0.1.2 +contourpy==1.0.7 +cycler==0.11.0 +debugpy==1.6.6 +decorator==5.1.1 +executing==1.2.0 +fastjsonschema==2.16.3 +fonttools==4.39.2 +hansken==2023.3.6 +idna==3.4 +ijson==3.2.0.post0 +ipykernel==6.21.3 +ipython==8.11.0 +iso8601==1.1.0 +jedi==0.18.2 +jsonschema==4.17.3 +jupyter_client==8.0.3 +jupyter_core==5.3.0 +kiwisolver==1.4.4 +Logbook==1.5.3 +matplotlib==3.7.1 +matplotlib-inline==0.1.6 +more-itertools==9.1.0 +nbformat==5.7.3 +nest-asyncio==1.5.6 +numpy==1.24.2 +packaging==23.0 +pandas==1.5.3 +parso==0.8.3 +pickleshare==0.7.5 +Pillow==9.4.0 +platformdirs==3.1.1 +plotly==5.13.1 +prompt-toolkit==3.0.38 +psutil==5.9.4 +pure-eval==0.2.2 +Pygments==2.14.0 +pyparsing==3.0.9 +pyrsistent==0.19.3 +python-dateutil==2.8.2 +pytz==2022.7.1 +# pywin32==305 +pyzmq==25.0.1 +requests==2.28.2 +six==1.16.0 +squarify==0.4.3 +stack-data==0.6.2 +tabulate==0.9.0 +tenacity==8.2.2 +tornado==6.2 +traitlets==5.9.0 +urllib3==1.26.15 +wcwidth==0.2.6 +#wordcloud==1.8.1 diff --git a/requirements-windows.txt b/requirements-windows.txt new file mode 100644 index 0000000..19b3cb1 --- /dev/null +++ b/requirements-windows.txt @@ -0,0 +1,60 @@ +asttokens==2.2.1 +attrs==22.2.0 +backcall==0.2.0 +certifi==2022.12.7 +charset-normalizer==3.1.0 +colorama==0.4.6 +comm==0.1.2 +contourpy==1.0.7 +cycler==0.11.0 +debugpy==1.6.6 +decorator==5.1.1 +executing==1.2.0 +fastjsonschema==2.16.3 +fonttools==4.39.2 +hansken==2023.3.6 +idna==3.4 +ijson==3.2.0.post0 +ipykernel==6.21.3 +ipython==8.11.0 +iso8601==1.1.0 +jedi==0.18.2 +jsonschema==4.17.3 +jupyter_client==8.0.3 +jupyter_core==5.3.0 +kiwisolver==1.4.4 +Logbook==1.5.3 +matplotlib==3.7.1 +matplotlib-inline==0.1.6 +more-itertools==9.1.0 +nbformat==5.7.3 +nest-asyncio==1.5.6 +numpy==1.24.2 +packaging==23.0 +pandas==1.5.3 +parso==0.8.3 +pickleshare==0.7.5 +Pillow==9.4.0 +platformdirs==3.1.1 +plotly==5.13.1 +prompt-toolkit==3.0.38 +psutil==5.9.4 
+pure-eval==0.2.2 +Pygments==2.14.0 +pyparsing==3.0.9 +pyrsistent==0.19.3 +python-dateutil==2.8.2 +pytz==2022.7.1 +pywin32==305 +pyzmq==25.0.1 +requests==2.28.2 +six==1.16.0 +squarify==0.4.3 +stack-data==0.6.2 +tabulate==0.9.0 +tenacity==8.2.2 +tornado==6.2 +traitlets==5.9.0 +urllib3==1.26.15 +wcwidth==0.2.6 +wordcloud==1.8.1 diff --git a/searches_time.py b/searches_time.py new file mode 100644 index 0000000..f450feb --- /dev/null +++ b/searches_time.py @@ -0,0 +1,54 @@ +# %% [markdown] +## Plot searches over time + +### Initialize Hansken connection +# Replace `hansken_host` with the ip of a Hansken instance. + +# %% [python] +import sys +import pandas as pd + +from types import SimpleNamespace +from matplotlib import pyplot + +from hansken.connect import connect_project +from hansken.query import RangeFacet + +# The line below finds out if we run in the browser by checking for the js module +in_browser = 'js' in sys.modules +hansken_host = '' +context = connect_project(endpoint=f'http://{hansken_host}:9091/gatekeeper/', + project='5ee273fd-0978-4a0a-b8b0-2af2f8479214', + keystore=f'http://{hansken_host}:9091/keystore/', + # Authentication is faked if we run in the browser, + # because an authenticated session should already be present + auth=SimpleNamespace() if in_browser else None, + interactive=True) + +# %% [markdown] +### Aggregate browser history data +# The cell below retrieves the browser activity from Hansken. We use a `Facet` to count the number of traces where the `accessedOn` property is within a specific day. +# %% [python] +# Group the number of searches by the accessedOn property on a scale of a day. 
A Facet on a date requires a min and max +facet = RangeFacet('browserHistory.accessedOn', scale='day', min="2022-01-01", max="2023-01-01") +# Perform search using the facet, set count=0 to prevent hansken returning traces +with context.search("browserHistory.accessedOn=2022", facets=facet, count=0) as searchResult: + # Convert to dataframe + dateFacetResult = searchResult.facets[0] + df = pd.DataFrame([[counter.value, counter.count] for _, counter in searchResult.facets[0].items()], columns=['Day', 'Count']) +# make sure pandas knows this is a timestamp +df['Day'] = pd.to_datetime(df['Day']) +df + +# %% [markdown] +### Plot the results +# The cell below uses `pyplot` to create a bar chart using the previous information, plotting the number of traces/day. + +# %% [python] +# Plot results +fig, ax = pyplot.subplots(figsize=(10, 6)) +ax.bar(df['Day'], df['Count']) +ax.set_xlabel("day") +ax.set_ylabel("count") +ax.set_title('') +pyplot.show() diff --git a/types_in_piechart.py b/types_in_piechart.py new file mode 100644 index 0000000..78e35dc --- /dev/null +++ b/types_in_piechart.py @@ -0,0 +1,40 @@ +# %% [markdown] +## Show the distribution of different trace types within a Hansken project +### Setup Hansken connection + +# %% [python] +import sys +import plotly.express as px +from types import SimpleNamespace + +from hansken.connect import connect_project +from hansken.query import TermFacet + +in_browser = 'js' in sys.modules +hansken_host = '' +project = 'd42bd9c3-63db-474c-a36f-b87e1eb9e2d3' +context = connect_project(endpoint=f'http://{hansken_host}:9091/gatekeeper/', + project=project, + keystore=f'http://{hansken_host}:9091/keystore/', + # Authentication is faked if we run in the browser, + # because an authenticated session should already be present + auth=SimpleNamespace() if in_browser else None, + interactive=True) + +# %% [markdown] +### Retrieve different trace types from Hansken. +# A `Facet` query is used to count the different types of traces. 
This `Facet` returns the 40 most common types of traces for a given query. We query for all traces (`*`), so this will return the most common types within the entire project. + +# %% [python] +facet = TermFacet('type', size=40) +# Perform search using the facet, set count=0 to prevent Hansken from returning traces +with context.search("*", facets=facet, count=0) as searchResult: + # ignore `origin` because it is a metatype, and `compressed` to limit the total number of types + ignoreable_types = {'origin', 'compressed'} + typeFacet = [bucket for bucket in searchResult.facets[0].values() + if bucket.value not in ignoreable_types] + counts = [bucket.count for bucket in typeFacet] + names = [bucket.value for bucket in typeFacet] + +fig = px.pie(values=counts, names=names, title=f'Trace types found in project {project}') +fig.show() diff --git a/unique_values_treemap_chatmessage.py b/unique_values_treemap_chatmessage.py new file mode 100644 index 0000000..c2e5c79 --- /dev/null +++ b/unique_values_treemap_chatmessage.py @@ -0,0 +1,41 @@ +# %% [markdown] +## Plot the distribution of senders of chat messages +### Setup Hansken connection +# %% [python] +import sys +import squarify +from types import SimpleNamespace +import matplotlib.pyplot as plt + +from hansken.connect import connect_project + +in_browser = 'js' in sys.modules +hansken_host = '' +context = connect_project(endpoint=f'http://{hansken_host}:9091/gatekeeper/', + project='5ee273fd-0978-4a0a-b8b0-2af2f8479214', + keystore=f'http://{hansken_host}:9091/keystore/', + # Authentication is faked if we run in the browser, + # because an authenticated session should already be present + auth=SimpleNamespace() if in_browser else None, + interactive=True) + +# %% [markdown] +### Retrieve all senders +# The `unique_values` function returns all values for a given property within a project. In this case, we retrieve all values for `chatMessage.from`. 
+ +# %% [python] +sizes = [] +labels = [] +for sender in context.unique_values("chatMessage.from"): + sizes.append(sender['count']) + labels.append(sender['value']) + +# %% [markdown] +### Use a treemap visualization to plot the distribution of senders. + +# %% [python] +fig = plt.figure(figsize=(12,6)) +ax = fig.add_subplot(111) +squarify.plot(sizes=sizes, label=labels, alpha=.6, ax=ax) +plt.axis('off') +plt.show()