diff --git a/01 query_word_cloud.py b/01 query_word_cloud.py index d8bebdb..32f23ec 100644 --- a/01 query_word_cloud.py +++ b/01 query_word_cloud.py @@ -11,32 +11,36 @@ from hansken.connect import connect_project in_browser = 'js' in sys.modules hansken_host = '' context = connect_project(endpoint=f'http://{hansken_host}:9091/gatekeeper/', - project='5ee273fd-0978-4a0a-b8b0-2af2f8479214', - keystore=f'http://{hansken_host}:9091/keystore/', - # Authentication is faked if we run in the browser, - # because an authenticated session should already be present - auth=SimpleNamespace() if in_browser else None, - interactive=True) - + project='5ee273fd-0978-4a0a-b8b0-2af2f8479214', + keystore=f'http://{hansken_host}:9091/keystore/', + # Authentication is faked if we run in the browser, + # because an authenticated session should already be present + auth=SimpleNamespace() if in_browser else None, + interactive=True) + # Hansken SDK running on localhost # context = connect_project(endpoint='http://localhost:9091/gatekeeper/', # project='d42bd9c3-63db-474c-a36f-b87e1eb9e2d3', # keystore='http://localhost:9090/keystore/') + # %% [markdown] ### Collect words # The cell below searches for all `chatMessage` traces in the current project. The `chatMessage.message` property contains the actual message. All found messages are concatenated in a single long string. + # %% [python] words = "" -with context.search("type:chatMessage") as searchResult: - for result in searchResult: - message = result.get("chatMessage.message") - if message is not None: - words += " " + message +with context.search("type:chatMessage") as search_result: + for result in search_result: + message = result.get("chatMessage.message") + if message is not None: + words += " " + message words + # %% [markdown] ### Draw Wordcloud # The cell below draws a wordcloud using the words occurring in the messages. `STOPWORDS` is used to ignore common english words. + # %% [python] # draw word cloud wc = WordCloud(stopwords=STOPWORDS, width=600, height=400).generate(words) diff --git a/02 searches_time.py b/02 searches_time.py index e9dcbc6..13d211a 100644 --- a/02 searches_time.py +++ b/02 searches_time.py @@ -35,10 +35,11 @@ context = connect_project(endpoint=f'http://{hansken_host}:9091/gatekeeper/', # Group the number of searches by the accessedOn property on a scale of a day. A Facet on a date requires a min and max facet = RangeFacet('browserHistory.accessedOn', scale='day', min="2022-01-01", max="2023-01-01") # Perform search using the facet, set count=0 to prevent hansken returning traces -with context.search("browserHistory.accessedOn=2022", facets=facet, count=0) as searchResult: - # Convert to dataframe - dateFacetResult = searchResult.facets[0] - df = pd.DataFrame([[counter.value, counter.count] for _, counter in searchResult.facets[0].items()], columns=['Day', 'Count']) +with context.search("browserHistory.accessedOn=2022", facets=facet, count=0) as search_result: + # Convert to dataframe + dateFacetResult = search_result.facets[0] + df = pd.DataFrame([[counter.value, counter.count] for _, counter in search_result.facets[0].items()], + columns=['Day', 'Count']) # make sure pandas knows this is a timestamp df['Day'] = pd.to_datetime(df['Day']) df diff --git a/03 hansken_facet_heatmap.py b/03 hansken_facet_heatmap.py index ff6d646..a729a83 100644 --- a/03 hansken_facet_heatmap.py +++ b/03 hansken_facet_heatmap.py @@ -1,17 +1,17 @@ # %% [markdown] # Plot searches over time -## Initialize Hansken connection import sys import pandas as pd from types import SimpleNamespace -from matplotlib import pyplot +from matplotlib import pyplot as plt import seaborn as sns -from matplotlib.colors import LogNorm, Normalize +from matplotlib.colors import LogNorm from hansken.connect import connect_project from hansken.query import RangeFacet + # %% [python] # setup Hansken project context @@ -39,7 +39,7 @@ context = connect_project(endpoint=f'http://{hansken_host}:9091/gatekeeper/', start = '2022-7-1T00:00Z' end = '2022-7-31T23:59Z' -#search_query = "type:chatMessage" +# search_query = "type:chatMessage" search_query = "type:browserHistory" # Group the number of searches by the accessedOn property on a scale of a day. A Facet on a date requires a min and max @@ -47,19 +47,19 @@ facet = RangeFacet('dates', scale='hour', min=start, max=end) # Create a dataframe with entries per hour for the period indicated by start and end df = pd.DataFrame() -df['Time'] = pd.date_range(start,end,freq='1H') +df['Time'] = pd.date_range(start, end, freq='1H') df['Count'] = 0 -df.set_index('Time',inplace=True) +df.set_index('Time', inplace=True) # Perform search using the facet -with context.search(search_query, facets=facet, count=0 ) as searchResult: - for _, result in searchResult.facets[0].items(): - df.loc[pd.to_datetime(result.value),'Count']=result.count +with context.search(search_query, facets=facet, count=0) as search_result: + for _, result in search_result.facets[0].items(): + df.loc[pd.to_datetime(result.value), 'Count'] = result.count # So that we can pivot and prepare a dataframe for our heatmap -df_map = pd.pivot_table( df, fill_value=0.0, columns=df.index.date, index=df.index.hour, aggfunc="sum")['Count'] - -sns.heatmap(df_map, cmap="Greens",norm=LogNorm()) +df_map = pd.pivot_table(df, fill_value=0.0, columns=df.index.date, index=df.index.hour, aggfunc="sum")['Count'] +sns.heatmap(df_map, cmap="Greens", norm=LogNorm()) +plt.show() # %% diff --git a/04 types_in_piechart.py b/04 types_in_piechart.py index 15e437e..ccb7c32 100644 --- a/04 types_in_piechart.py +++ b/04 types_in_piechart.py @@ -35,13 +35,13 @@ context = connect_project(endpoint=f'http://{hansken_host}:9091/gatekeeper/', # %% [python] facet = TermFacet('type', size=40) # Perform search using the facet, set count=0 to prevent hansken returning traces -with context.search("*", facets=facet, count=0) as searchResult: +with context.search("*", facets=facet, count=0) as search_result: # ignore origin because it is a metatype and compressed to limit the total number of types ignoreable_types = {'origin', 'compressed'} - typeFacet = [bucket for bucket in searchResult.facets[0].values() - if bucket.value not in ignoreable_types] - counts = [bucket.count for bucket in typeFacet] - names = [bucket.value for bucket in typeFacet] + type_facet = [bucket for bucket in search_result.facets[0].values() + if bucket.value not in ignoreable_types] + counts = [bucket.count for bucket in type_facet] + names = [bucket.value for bucket in type_facet] fig = px.pie(values=counts, names=names, title=f'Trace types found in project') fig.show() diff --git a/05 unique_values_treemap_chatmessage.py b/05 unique_values_treemap_chatmessage.py index 39266c1..8a061c0 100644 --- a/05 unique_values_treemap_chatmessage.py +++ b/05 unique_values_treemap_chatmessage.py @@ -1,6 +1,7 @@ # %% [markdown] ## Plot the distribution of senders of chat messages ### Setup Hansken connection + # %% [python] import sys import squarify @@ -29,7 +30,8 @@ context = connect_project(endpoint=f'http://{hansken_host}:9091/gatekeeper/', # %% [markdown] ### Retrieve all senders -# The `unique_values` function returns all values for a given property within a project. In this case, we retrieve all values for `chatMessage.from`. +# The `unique_values` function returns all values and the number of occurrences for a given property within a project. +# In this case, we retrieve all values for `chatMessage.from`. # %% [python] sizes = [] @@ -42,7 +44,7 @@ for sender in context.unique_values("chatMessage.from"): ### Use a treemap visualization to plot the distribution of senders. # %% [python] -fig = plt.figure(figsize=(12,6)) +fig = plt.figure(figsize=(12, 6)) ax = fig.add_subplot(111) squarify.plot(sizes=sizes, label=labels, alpha=.6, ax=ax) plt.axis('off')