Skip to content

Commit

Permalink
history
Browse files Browse the repository at this point in the history
  • Loading branch information
chrisclark committed Apr 25, 2024
1 parent 891310d commit 9a81ca3
Show file tree
Hide file tree
Showing 16 changed files with 314 additions and 185 deletions.
8 changes: 8 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@ Change Log
This document records all notable changes to `django-sql-explorer <https://github.com/chrisclark/django-sql-explorer>`_.
This project adheres to `Semantic Versioning <https://semver.org/>`_.

`4.1.0b1`_ (2024-04-25)
===========================
* `#609`_: Tracking should be opt-in and not use the SECRET_KEY
* `#610`_: Import error (sql_metadata) with 4.1 version
* `#612`_: Accessing the database during app initialization
* Regex-injection vulnerability
* Better anonymization for telemetry

`4.1.0`_ (2024-04-23)
===========================
* SQL Assistant: Built in query help via OpenAI (or LLM of choice), with relevant schema
Expand Down
6 changes: 3 additions & 3 deletions docs/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -338,11 +338,11 @@ but a dotted path to a python view can be used
EXPLORER_NO_PERMISSION_VIEW = 'explorer.views.auth.safe_login_view_wrapper'
Anonymous Usage Stat Collection
*******************************
Anonymous Telemetry Collection
******************************

By default, anonymous usage statistics are collected. To disable this, set the following setting to False.
You can see what is being collected in tracker.py.
You can see what is being collected in telemetry.py.

.. code-block:: python
Expand Down
6 changes: 3 additions & 3 deletions explorer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
__version_info__ = {
"major": 4,
"minor": 1,
"minor": 2,
"patch": 0,
"releaselevel": "final",
"serial": 0
"releaselevel": "beta",
"serial": 1
}


Expand Down
7 changes: 4 additions & 3 deletions explorer/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,11 @@ def _package(queries):
is_one = len(queries) == 1
name_root = lambda n: f"attachment; filename={n}" # noqa
ret["content_type"] = (is_one and "text/csv") or "application/zip"

formatted = queries[0].title.replace(",", "")
day = date.today()
ret["filename"] = (
is_one and name_root("%s.csv" % queries[0].title.replace(",", ""))
) or name_root("Report_%s.zip" % date.today())
is_one and name_root(f"{formatted}.csv")
) or name_root(f"Report_{day}.zip")

ret["data"] = (
is_one and CSVExporter(queries[0]).get_output()
Expand Down
24 changes: 0 additions & 24 deletions explorer/apps.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from django.apps import AppConfig
from django.core.exceptions import ImproperlyConfigured
from django.db import connections as djcs
from django.db.utils import DatabaseError
from django.utils.translation import gettext_lazy as _


Expand All @@ -15,7 +14,6 @@ def ready(self):
from explorer.schema import build_async_schemas
_validate_connections()
build_async_schemas()
track_summary_stats()


def _get_default():
Expand Down Expand Up @@ -44,25 +42,3 @@ def _validate_connections():
f"EXPLORER_CONNECTIONS contains ({name}, {conn_name}), "
f"but {conn_name} is not a valid Django DB connection."
)


def track_summary_stats():
    """Track the startup summary stats if the Query table can be read.

    Returns without tracking anything when querying ``Query`` raises
    ``DatabaseError``.
    """
    from explorer.tracker import Stat, StatNames, gather_summary_stats
    from explorer.models import Query

    # Django doesn't actually have a way of running code on application initialization, so we have come up with this.
    # The app.ready() method (the call site for this function) is invoked *before* any migrations are run. So if we
    # were to just call this function in ready(), without the try: block, then it would always fail the very first
    # time Django runs (and e.g. in test runs) because no tables have yet been created. The intuitive way to handle
    # this with Django would be to tie into the post_migrate signal in ready() and run this function on post_migrate.
    # But that doesn't work because that signal is only called if a migration has actually been applied. If the app
    # restarts and there are no new migrations, the signal never fires. So instead we check if the Query table
    # exists, and if it does, we're good to gather stats.
    try:
        Query.objects.first()
    except DatabaseError:
        return

    summary = gather_summary_stats()
    Stat(StatNames.STARTUP_STATS, summary).track()
7 changes: 3 additions & 4 deletions explorer/assistant/utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
from explorer import app_settings
from explorer.schema import schema_info
from explorer.utils import get_valid_connection
from sql_metadata import Parser
from django.db.utils import OperationalError

if app_settings.EXPLORER_AI_API_KEY:
import tiktoken
from openai import OpenAI

OPENAI_MODEL = app_settings.EXPLORER_ASSISTANT_MODEL["name"]
ROW_SAMPLE_SIZE = 2


def openai_client():
from openai import OpenAI
return OpenAI(
api_key=app_settings.EXPLORER_AI_API_KEY,
base_url=app_settings.EXPLORER_ASSISTANT_BASE_URL
Expand Down Expand Up @@ -73,6 +70,7 @@ def format_rows_from_table(rows):


def get_table_names_from_query(sql):
from sql_metadata import Parser
if sql:
try:
parsed = Parser(sql)
Expand All @@ -84,6 +82,7 @@ def get_table_names_from_query(sql):

def num_tokens_from_string(string: str) -> int:
    """Return how many tokens ``string`` encodes to for ``OPENAI_MODEL``."""
    # tiktoken is imported at call time rather than at module import.
    import tiktoken
    tokenizer = tiktoken.encoding_for_model(OPENAI_MODEL)
    return len(tokenizer.encode(string))
Expand Down
2 changes: 1 addition & 1 deletion explorer/assistant/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from django.views.decorators.http import require_POST
import json

from explorer.tracker import Stat, StatNames
from explorer.telemetry import Stat, StatNames
from explorer.utils import get_valid_connection
from explorer.assistant.models import PromptLog
from explorer.assistant.prompts import primary_prompt
Expand Down
21 changes: 21 additions & 0 deletions explorer/migrations/0015_explorervalue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generated by Django 4.2.8 on 2024-04-25 13:34

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds the ExplorerValue model: rows made of a short coded key and a text value.

    # Must run after the migration that precedes it in the explorer app.
    dependencies = [
        ('explorer', '0014_promptlog'),
    ]

    operations = [
        migrations.CreateModel(
            name='ExplorerValue',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # The key is restricted to the two coded choices listed here.
                ('key', models.CharField(choices=[('UUID', 'Install Unique ID'), ('SMLS', 'Startup metric last send')], max_length=5)),
                # The value is free-form text and may be NULL or blank.
                ('value', models.TextField(blank=True, null=True)),
            ],
        ),
    ]
50 changes: 49 additions & 1 deletion explorer/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from time import time
import uuid

from django.conf import settings
from django.core.exceptions import ValidationError
Expand All @@ -8,7 +9,7 @@
from django.utils.translation import gettext_lazy as _

from explorer import app_settings
from explorer.tracker import Stat, StatNames
from explorer.telemetry import Stat, StatNames
from explorer.utils import (
extract_params, get_params_for_url, get_s3_bucket, get_valid_connection, passes_blacklist, s3_url,
shared_dict_update, swap_params,
Expand Down Expand Up @@ -393,3 +394,50 @@ def stats(self):

def __str__(self):
return str(self._header)


class ExplorerValueManager(models.Manager):
    """Manager exposing typed accessors for the key/value rows of ``ExplorerValue``."""

    def get_uuid(self):
        """Return the install UUID, creating or repairing its row when needed.

        A row is created with a fresh UUID if none exists for
        ``ExplorerValue.INSTALL_UUID``. If the row exists but its value is
        empty (``None`` or ``""``), a new UUID is generated and saved.

        Returns:
            The UUID as a string.
        """
        uuid_obj, created = self.get_or_create(
            key=ExplorerValue.INSTALL_UUID,
            defaults={"value": str(uuid.uuid4())},
        )
        # A freshly created row already carries the UUID from ``defaults``, so
        # only an existing row with an empty value needs a new one written.
        if not created and not uuid_obj.value:
            uuid_obj.value = str(uuid.uuid4())
            uuid_obj.save()
        return uuid_obj.value

    def get_startup_last_send(self):
        """Return when startup stats were last sent, as a float Unix timestamp.

        Returns:
            The stored timestamp, or ``None`` when the row does not exist or
            its value is empty.
        """
        # Stored as a Unix timestamp
        try:
            timestamp = self.get(key=ExplorerValue.STARTUP_METRIC_LAST_SEND).value
        except ExplorerValue.DoesNotExist:
            return None
        if timestamp:
            return float(timestamp)
        return None

    def set_startup_last_send(self, ts):
        """Store ``ts`` (stringified) as the last-send time, creating the row if missing."""
        self.update_or_create(
            key=ExplorerValue.STARTUP_METRIC_LAST_SEND,
            defaults={"value": str(ts)},
        )


class ExplorerValue(models.Model):
    """Key/value storage row; ``key`` is one of the coded choices below."""

    # Coded keys stored in the ``key`` column.
    INSTALL_UUID = "UUID"
    STARTUP_METRIC_LAST_SEND = "SMLS"
    # (stored code, human-readable label) pairs offered as choices for ``key``.
    EXPLORER_SETTINGS_CHOICES = [
        (INSTALL_UUID, "Install Unique ID"),
        (STARTUP_METRIC_LAST_SEND, "Startup metric last send"),
    ]

    key = models.CharField(max_length=5, choices=EXPLORER_SETTINGS_CHOICES)
    # Free-form text; may be NULL or blank.
    value = models.TextField(null=True, blank=True)

    # Custom manager providing the UUID and last-send accessors.
    objects = ExplorerValueManager()
152 changes: 152 additions & 0 deletions explorer/telemetry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Anonymous usage stats
# Opt-out by setting EXPLORER_ENABLE_ANONYMOUS_STATS = False in settings

import logging
import time
import requests
import json
import threading
from enum import Enum, auto
from django.core.cache import cache
from django.db import connection
from django.db.models import Count
from django.db.migrations.recorder import MigrationRecorder
from django.conf import settings

logger = logging.getLogger(__name__)


def instance_identifier():
    """Return this install's identifier, consulting the cache before the database.

    On a cache miss (or a falsy cached value) the identifier is read via
    ``ExplorerValue.objects.get_uuid()`` and cached for 24 hours.
    """
    from explorer.models import ExplorerValue
    cache_key = "explorer_instance_identifier"
    identifier = cache.get(cache_key)
    if identifier:
        return identifier
    identifier = ExplorerValue.objects.get_uuid()
    cache.set(cache_key, identifier, 60 * 60 * 24)
    return identifier


class SelfNamedEnum(Enum):
    """Enum base whose ``auto()`` members take their own name as their value."""

    @staticmethod
    def _generate_next_value_(name, start, count, last_values):
        """Return the member's name; the other arguments are ignored."""
        return name


class StatNames(SelfNamedEnum):
    """Names of the stats that can be tracked; each member's value is its own name."""

    QUERY_RUN = auto()
    QUERY_STREAM = auto()
    STARTUP_STATS = auto()
    ASSISTANT_RUN = auto()


class Stat:
    """A single telemetry data point, stamped with the instance id and current time.

    ``track()`` sends the stat on a background thread, throttled so that
    non-summary stats go out at most once per ``STAT_COLLECTION_INTERVAL``.
    Summary (startup) stats bypass that throttle and are instead limited by
    ``STARTUP_STAT_COLLECTION_INTERVAL`` via a timestamp kept in the database.
    """

    STAT_COLLECTION_INTERVAL = 60 * 10  # Ten minutes
    STARTUP_STAT_COLLECTION_INTERVAL = 60 * 60 * 24 * 7  # A week

    def __init__(self, name: StatNames, value):
        """Build a stat called ``name`` carrying ``value`` as its payload.

        Every instance attribute set here is serialized by ``track()``, which
        dumps ``self.__dict__`` to JSON.
        """
        self.instanceId = instance_identifier()
        self.time = time.time()
        self.value = value
        self.name = name.value

    @property
    def is_summary(self):
        """True when this stat is the startup summary stat."""
        return self.name == StatNames.STARTUP_STATS.value

    def should_send_summary_stats(self):
        """Return True if summary stats were never sent or were last sent a week or more ago."""
        from explorer.models import ExplorerValue
        last_send = ExplorerValue.objects.get_startup_last_send()
        if not last_send:
            return True
        return self.time - last_send >= self.STARTUP_STAT_COLLECTION_INTERVAL

    def send_summary_stats(self):
        """Gather and track the summary stats, then record this stat's time as the last send."""
        from explorer.models import ExplorerValue
        payload = _gather_summary_stats()
        Stat(StatNames.STARTUP_STATS, payload).track()
        ExplorerValue.objects.set_startup_last_send(self.time)

    def track(self):
        """Send this stat unless tracking is disabled or the throttle is still active."""
        from explorer import app_settings
        if not app_settings.EXPLORER_ENABLE_ANONYMOUS_STATS:
            return

        cache_key = "last_stat_sent_time"
        last_sent_time = cache.get(cache_key, 0)
        # Summary stats are tracked with a different time interval
        if self.is_summary or self.time - last_sent_time >= self.STAT_COLLECTION_INTERVAL:
            data = json.dumps(self.__dict__)
            thread = threading.Thread(target=_send, args=(data,))
            thread.start()
            # Keep the marker for the whole throttle window. Without an explicit
            # timeout the cache backend's default applies, and if that default is
            # shorter than the interval the marker expires early and stats are
            # sent more often than intended.
            cache.set(cache_key, self.time, timeout=self.STAT_COLLECTION_INTERVAL)

        # Every time we send any tracking, see if we have recently sent overall summary stats
        # Of course, sending the summary stats calls .track(), so we need to NOT call track()
        # again if we are in fact already in the process of sending summary stats. Otherwise,
        # we will end up in infinite recursion of track() calls.
        if not self.is_summary and self.should_send_summary_stats():
            self.send_summary_stats()


def _send(data, timeout=10):
    """POST the JSON-encoded ``data`` to the configured collection endpoint.

    Best effort: any failure is logged as a warning and never raised.

    Args:
        data: JSON string used as the request body.
        timeout: Seconds to wait for the endpoint before giving up. Without a
            timeout ``requests`` waits indefinitely, which would leave the
            calling thread hanging on an unresponsive endpoint.
    """
    from explorer import app_settings
    try:
        requests.post(
            app_settings.EXPLORER_COLLECT_ENDPOINT_URL,
            data=data,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
    except Exception as e:
        logger.warning("Failed to send stats: %s", e)


def _get_install_quarter():
    """Return the quarter of the earliest applied explorer migration.

    Returns:
        A string such as ``"Q2-2024"``, or ``None`` when no migration is
        recorded for the ``explorer`` app.
    """
    earliest = (
        MigrationRecorder.Migration.objects
        .filter(app="explorer")
        .order_by("applied")
        .first()
    )
    if earliest is None:
        return None

    # Months 1-3 map to quarter 1, 4-6 to quarter 2, and so on.
    quarter = (earliest.applied.month - 1) // 3 + 1
    year = earliest.applied.year
    return f"Q{quarter}-{year}"


def _gather_summary_stats():
    """Collect coarse install-level stats for the startup telemetry payload.

    Counts are rounded to multiples of 10 or 5 rather than reported exactly.

    Returns:
        A dict of stats, or ``{"error": ...}`` if anything raised while
        gathering them.
    """
    from explorer import app_settings
    from explorer.models import Query, QueryLog
    import explorer

    def to_multiple_of_ten(count):
        return round(count * 0.1) * 10

    def to_multiple_of_five(count):
        return round(count * 0.2) * 5

    try:
        log_counts = QueryLog.objects.aggregate(
            total_count=Count("*"),
            unique_run_by_user_count=Count("run_by_user_id", distinct=True),
        )
        query_counts = Query.objects.aggregate(
            total_count=Count("*"),
            unique_connection_count=Count("connection", distinct=True),
        )

        # Round the counts to provide additional anonymity
        return {
            "total_log_count": to_multiple_of_ten(log_counts["total_count"]),
            "unique_run_by_user_count": to_multiple_of_five(log_counts["unique_run_by_user_count"]),
            "total_query_count": to_multiple_of_ten(query_counts["total_count"]),
            "unique_connection_count": to_multiple_of_five(query_counts["unique_connection_count"]),
            "default_database": connection.vendor,
            "explorer_install_quarter": _get_install_quarter(),
            "debug": settings.DEBUG,
            "tasks_enabled": app_settings.ENABLE_TASKS,
            "unsafe_rendering": app_settings.UNSAFE_RENDERING,
            "transform_count": len(app_settings.EXPLORER_TRANSFORMS),
            "assistant_enabled": app_settings.EXPLORER_AI_API_KEY is not None,
            "version": explorer.get_version(),
            "charts_enabled": app_settings.EXPLORER_CHARTS_ENABLED,
        }
    except Exception as e:
        return {"error": f"error gathering stats: {e}"}
Loading

0 comments on commit 9a81ca3

Please sign in to comment.